diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index d1c805b46abd20..55eb590619bd0c 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -343,4 +343,9 @@ extern void um_trace_signals_off(void);
 /* time-travel */
 extern void deliver_time_travel_irqs(void);
 
+/* zpoline */
+#ifndef CONFIG_MMU
+extern int um_zpoline_enabled;
+#endif
+
 #endif
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index d02181d1bfe36f..de3ed8fc026873 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -27,11 +27,16 @@ static void sigsys_handler(int sig, struct siginfo *si, mcontext_t *mc)
 {
 	struct uml_pt_regs r;
 
-	/* trap SIGSYS to userspace */
-	get_regs_from_mc(&r, mc);
-	trap_sigsys(&r);
-	/* force handle signals after rt_sigreturn() */
-	mc_set_regs_ip_relay(mc);
+	if (!um_zpoline_enabled) {
+		/* hook syscall via SIGSYS */
+		mc_set_sigsys_hook(mc);
+	} else {
+		/* trap SIGSYS to userspace */
+		get_regs_from_mc(&r, mc);
+		trap_sigsys(&r);
+		/* force handle signals after rt_sigreturn() */
+		mc_set_regs_ip_relay(mc);
+	}
 }
 #endif
 
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 33f69f1eac10d6..6f5977ff0d213a 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -188,6 +188,9 @@ do { \
 struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 	int uses_interp);
+struct elf_fdpic_params;
+extern int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params,
+				  struct elf_fdpic_params *interp_params);
 
 extern unsigned long um_vdso_addr;
 #define AT_SYSINFO_EHDR 33
diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile
index 5febc4cea9ed03..23e42b7fb6bdfa 100644
--- a/arch/x86/um/nommu/Makefile
+++ b/arch/x86/um/nommu/Makefile
@@ -5,4 +5,17 @@ else
 	BITS := 64
 endif
 
-obj-y = do_syscall_$(BITS).o entry_$(BITS).o process.o syscalls_$(BITS).o
+# used by zpoline.c to translate syscall/sysenter instructions
+# note: only in x86_64 w/ !CONFIG_MMU
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN     $@
+      cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+	$(call cmd,inat_tables)
+
+targets += inat-tables.c
+$(obj)/../../lib/inat.o: $(obj)/inat-tables.c
+obj-y += ../../lib/insn.o ../../lib/inat.o
+
+obj-y += do_syscall_$(BITS).o entry_$(BITS).o process.o syscalls_$(BITS).o zpoline.o
diff --git a/arch/x86/um/nommu/zpoline.c b/arch/x86/um/nommu/zpoline.c
new file mode 100644
index 00000000000000..2e30ee7193625e
--- /dev/null
+++ b/arch/x86/um/nommu/zpoline.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * zpoline.c
+ *
+ * Replace syscall/sysenter instructions with `call *%rax` to hook syscalls.
+ *
+ */
+//#define DEBUG
+/* NOTE(review): include targets were missing here; rebuilt from usage - confirm */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/elf-fdpic.h>
+#include <asm/unistd.h>
+#include <asm/insn.h>
+#include <sysdep/syscalls.h>
+#include <os.h>
+
+int um_zpoline_enabled;
+/* start of trampoline code area */
+static char *__zpoline_start;
+
+/*
+ * Walk the PT_LOAD program headers of @params and, in every segment carrying
+ * PF_X, rewrite each syscall (0f 05) / sysenter (0f 34) into `call *%rax`.
+ *
+ * Returns the number of rewritten instructions, 0 when @params records no ELF
+ * header address, or -EINVAL when a PT_LOAD segment starts at or below the
+ * trampoline range (__zpoline_start + 0x1100).
+ */
+static int __zpoline_translate_syscalls(struct elf_fdpic_params *params)
+{
+	int count = 0, loop;
+	struct insn insn;
+	unsigned long addr, end;
+	u8 *op;
+	struct elf_fdpic_loadseg *seg;
+	struct elf_phdr *phdr;
+	struct elfhdr *ehdr = (struct elfhdr *)params->elfhdr_addr;
+
+	if (!ehdr)
+		return 0;
+
+	seg = params->loadmap->segs;
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (phdr->p_type != PT_LOAD)
+			continue;
+		addr = seg->addr;
+		/* skip translation of trampoline code */
+		if (addr <= (unsigned long)(&__zpoline_start[0] + 0x1000 + 0x0100)) {
+			pr_warn("%lx: address is in the range of trampoline\n", addr);
+			return -EINVAL;
+		}
+
+		/* translate only segment with Executable flag */
+		if (!(phdr->p_flags & PF_X)) {
+			seg++;
+			continue;
+		}
+
+		end = seg->addr + seg->p_memsz;
+		pr_debug("translation 0x%lx-0x%lx\n", addr, end);
+		/* now ready to translate */
+		while (addr < end) {
+			/* never let the decoder read past the segment end */
+			insn_init(&insn, (void *)addr,
+				  min_t(unsigned long, MAX_INSN_SIZE, end - addr), 1);
+			insn_get_length(&insn);
+
+			insn_get_opcode(&insn);
+
+			switch (insn.opcode.bytes[0]) {
+			case 0xf:
+				switch (insn.opcode.bytes[1]) {
+				case 0x05: /* syscall */
+				case 0x34: /* sysenter */
+					pr_debug("%lx: found syscall/sysenter\n", addr);
+					/* patch the opcode bytes, not a leading prefix */
+					op = (u8 *)addr + insn_offset_opcode(&insn);
+					op[0] = 0xff; /* callq */
+					op[1] = 0xd0; /* *%rax */
+					count++;
+					break;
+				}
+				/* do not fall through into the default label */
+				break;
+			default:
+				break;
+			}
+
+			addr += insn.length;
+			if (insn.length == 0) {
+				pr_debug("%lx: length zero with byte %x. skip ?\n",
+					 addr, insn.opcode.bytes[0]);
+				addr += 1;
+			}
+		}
+		seg++;
+	}
+	return count;
+}
+
+/**
+ * elf_arch_finalize_exec()  - architecture hook to translate syscall/sysenter
+ *
+ * translate syscall/sysenter instruction upon loading ELF binary file
+ * on execve(2)&co syscall.
+ *
+ * suppose we have those instructions:
+ *
+ *    mov $sysnr, %rax
+ *    syscall                 0f 05
+ *
+ * this will translate it with:
+ *
+ *    mov $sysnr, %rax (<= untouched)
+ *    call *%rax              ff d0
+ *
+ * the call then reaches the hook function, guided by the trampoline code
+ * installed at setup_zpoline_trampoline().
+ *
+ * @exec_params: ELF meta data for executable file
+ * @interp_params: ELF meta data for the interpreter file
+ *
+ * Return: 0 on success or when zpoline is disabled, -EINTR when taking the
+ * mmap lock is interrupted, or the negative error from the translation.
+ */
+int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params,
+			   struct elf_fdpic_params *interp_params)
+{
+	int err = 0, count = 0;
+	struct mm_struct *mm = current->mm;
+
+	/* zpoline disabled */
+	if (!um_zpoline_enabled)
+		return 0;
+
+	if (down_write_killable(&mm->mmap_lock))
+		return -EINTR;
+
+	/* translate for the executable */
+	err = __zpoline_translate_syscalls(exec_params);
+	if (err < 0) {
+		pr_info("zpoline: xlate error %d\n", err);
+		goto out;
+	}
+	count += err;
+	pr_debug("zpoline: rewritten (exec) %d syscalls\n", count);
+
+	/* translate for the interpreter */
+	err = __zpoline_translate_syscalls(interp_params);
+	if (err < 0) {
+		pr_info("zpoline: xlate error %d\n", err);
+		goto out;
+	}
+	count += err;
+
+	err = 0;
+	pr_debug("zpoline: rewritten (exec+interp) %d syscalls\n", count);
+
+out:
+	up_write(&mm->mmap_lock);
+	return err;
+}
+
+/**
+ * setup_zpoline_trampoline()  - install trampoline code for zpoline
+ *
+ * setup trampoline code for syscall hooks
+ *
+ * the trampoline code guides to call hooked function, __kernel_vsyscall
+ * in this case, via nop slides at the memory address zero (thus, zpoline).
+ *
+ * loaded binary by exec(2) is translated to call the function.
+ */
+static int __init setup_zpoline_trampoline(void)
+{
+	int i, ret;
+	int ptr;
+
+	if (!um_zpoline_enabled)
+		return 0;
+
+	/* zpoline: map area of trampoline code started from addr 0x0 */
+	__zpoline_start = 0x0;
+
+	ret = os_map_memory((void *) 0, -1, 0, PAGE_SIZE, 1, 1, 1);
+	if (ret)
+		panic("map failed\n NOTE: /proc/sys/vm/mmap_min_addr should be set 0\n");
+
+	/* fill nop instructions until the trampoline code */
+	for (i = 0; i < NR_syscalls; i++)
+		__zpoline_start[i] = 0x90;
+
+	/* optimization to skip old syscalls */
+	/* short jmp */
+	__zpoline_start[214 /* __NR_epoll_ctl_old */] = 0xeb;
+	/* range of a short jmp : -128 ~ +127 */
+	__zpoline_start[215 /* __NR_epoll_wait_old */] = 127;
+
+	/**
+	 * FIXME: shift red zone area to properly handle the case
+	 */
+
+	/**
+	 * put code for jumping to __kernel_vsyscall.
+	 *
+	 * here we embed the following code.
+	 *
+	 * movabs [$addr],%r11
+	 * jmpq   *%r11
+	 *
+	 */
+	ptr = NR_syscalls;
+	/* 49 bb [64-bit addr (8-byte)]    movabs [64-bit addr (8-byte)],%r11 */
+	__zpoline_start[ptr++] = 0x49;
+	__zpoline_start[ptr++] = 0xbb;
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 0));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 1));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 2));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 3));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 4));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 5));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 6));
+	__zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 7));
+
+	/*
+	 * pretending to be syscall instruction by putting return
+	 * address in %rcx.
+	 */
+	/* 48 8b 0c 24               mov    (%rsp),%rcx */
+	__zpoline_start[ptr++] = 0x48;
+	__zpoline_start[ptr++] = 0x8b;
+	__zpoline_start[ptr++] = 0x0c;
+	__zpoline_start[ptr++] = 0x24;
+
+	/* 41 ff e3                jmp    *%r11 */
+	__zpoline_start[ptr++] = 0x41;
+	__zpoline_start[ptr++] = 0xff;
+	__zpoline_start[ptr++] = 0xe3;
+
+	/* permission: XOM (PROT_EXEC only) */
+	ret = os_protect_memory(0, PAGE_SIZE, 0, 0, 1);
+	if (ret)
+		panic("failed: can't configure permission on trampoline code");
+
+	pr_info("zpoline: setting up trampoline code done\n");
+	return 0;
+}
+arch_initcall(setup_zpoline_trampoline);
+
+static int __init zpoline_set(char *str)
+{
+	int val = 0;
+
+	get_option(&str, &val);
+	um_zpoline_enabled = val;
+	return 1;
+}
+__setup("zpoline=", zpoline_set);
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
index cbae2584124fd0..70be7082534802 100644
--- a/arch/x86/um/vdso/um_vdso.c
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -19,15 +19,48 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
 __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
 long __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
 
+/* XXX: FIXME, how to retrieve this switch from vdso object ??? */
+static int um_zpoline_enabled;
+
+/*
+ * Issue a one- or two-argument system call and store its result in @ret.
+ * `call *%rax` is used only when um_zpoline_enabled is set and CONFIG_MMU is
+ * off. Each macro is a single statement; volatile keeps the asm from being dropped.
+ */
+#define __VDSO_SYSCALL1(sysnr, ret, a0)					\
+do {									\
+	if (!um_zpoline_enabled || IS_ENABLED(CONFIG_MMU)) {		\
+		asm volatile("syscall"					\
+			: "=a" (ret)					\
+			: "0" (sysnr), "D" (a0)				\
+			: "rcx", "r11", "memory");			\
+	} else {							\
+		asm volatile("call *%%rax"				\
+			: "=a" (ret)					\
+			: "a" (sysnr), "D" (a0)				\
+			: "rcx", "r11", "memory");			\
+	}								\
+} while (0)
+#define __VDSO_SYSCALL2(sysnr, ret, a0, a1)				\
+do {									\
+	if (!um_zpoline_enabled || IS_ENABLED(CONFIG_MMU)) {		\
+		asm volatile("syscall"					\
+			: "=a" (ret)					\
+			: "0" (sysnr), "D" (a0), "S" (a1)		\
+			: "rcx", "r11", "memory");			\
+	} else {							\
+		asm volatile("call *%%rax"				\
+			: "=a" (ret)					\
+			: "a" (sysnr), "D" (a0), "S" (a1)		\
+			: "rcx", "r11", "memory");			\
+	}								\
+} while (0)
+
 int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
 {
 	long ret;
 
-	asm("syscall"
-		: "=a" (ret)
-		: "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
-		: "rcx", "r11", "memory");
-
+	__VDSO_SYSCALL2(__NR_clock_gettime, ret, clock, ts);
 	return ret;
 }
 int clock_gettime(clockid_t, struct __kernel_old_timespec *)
@@ -37,11 +70,7 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
 {
 	long ret;
 
-	asm("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
-		: "rcx", "r11", "memory");
-
+	__VDSO_SYSCALL2(__NR_gettimeofday, ret, tv, tz);
 	return ret;
 }
 int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
@@ -51,10 +80,7 @@ __kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
 {
 	long secs;
 
-	asm volatile("syscall"
-		: "=a" (secs)
-		: "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
-
+	__VDSO_SYSCALL1(__NR_time, secs, t);
 	return secs;
 }
 __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));