Skip to content

Commit

Permalink
x86/um: nommu: syscall translation by zpoline
Browse files Browse the repository at this point in the history
This commit adds a mechanism to hook syscalls issued by unmodified userspace
programs running under UML in !MMU mode. The mechanism, called zpoline,
replaces syscall/sysenter instructions with `call *%rax`, which is then
handled by trampoline code installed by an initcall during boot. The
translation is triggered by elf_arch_finalize_exec(), an arch hook
introduced by another commit.

All syscalls issued by userspace are thus redirected to a specific function,
__kernel_vsyscall, introduced as a syscall entry point for !MMU UML.
This is a completely different code path from the ptrace(2)-based syscall
hooking used by MMU-full UML.

Signed-off-by: Hajime Tazaki <thehajime@gmail.com>
  • Loading branch information
thehajime committed Dec 5, 2024
1 parent 6f7de73 commit e483321
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 20 deletions.
5 changes: 5 additions & 0 deletions arch/um/include/shared/os.h
Original file line number Diff line number Diff line change
Expand Up @@ -343,4 +343,9 @@ extern void um_trace_signals_off(void);
/* time-travel */
extern void deliver_time_travel_irqs(void);

/* zpoline */
#ifndef CONFIG_MMU
extern int um_zpoline_enabled;
#endif

#endif
15 changes: 10 additions & 5 deletions arch/um/os-Linux/signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,16 @@ static void sigsys_handler(int sig, struct siginfo *si, mcontext_t *mc)
{
struct uml_pt_regs r;

/* trap SIGSYS to userspace */
get_regs_from_mc(&r, mc);
trap_sigsys(&r);
/* force handle signals after rt_sigreturn() */
mc_set_regs_ip_relay(mc);
if (!um_zpoline_enabled) {
/* hook syscall via SIGSYS */
mc_set_sigsys_hook(mc);
} else {
/* trap SIGSYS to userspace */
get_regs_from_mc(&r, mc);
trap_sigsys(&r);
/* force handle signals after rt_sigreturn() */
mc_set_regs_ip_relay(mc);
}
}
#endif

Expand Down
3 changes: 3 additions & 0 deletions arch/x86/um/asm/elf.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ do { \
struct linux_binprm;
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
struct elf_fdpic_params;
extern int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params,
struct elf_fdpic_params *interp_params);

extern unsigned long um_vdso_addr;
#define AT_SYSINFO_EHDR 33
Expand Down
15 changes: 14 additions & 1 deletion arch/x86/um/nommu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,17 @@ else
BITS := 64
endif

obj-y = do_syscall_$(BITS).o entry_$(BITS).o process.o syscalls_$(BITS).o
# used by zpoline.c to translate syscall/sysenter instructions
# note: only in x86_64 w/ !CONFIG_MMU
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
$(call cmd,inat_tables)

targets += inat-tables.c
$(obj)/../../lib/inat.o: $(obj)/inat-tables.c
obj-y += ../../lib/insn.o ../../lib/inat.o

obj-y += do_syscall_$(BITS).o entry_$(BITS).o process.o syscalls_$(BITS).o zpoline.o
241 changes: 241 additions & 0 deletions arch/x86/um/nommu/zpoline.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
// SPDX-License-Identifier: GPL-2.0
/*
* zpoline.c
*
* Replace syscall/sysenter instructions to `call *%rax` to hook syscalls.
*
*/
//#define DEBUG
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/elf-fdpic.h>
#include <asm/unistd.h>
#include <asm/insn.h>
#include <sysdep/syscalls.h>
#include <os.h>

int um_zpoline_enabled;
/* start of trampoline code area */
static char *__zpoline_start;

/**
 * __zpoline_translate_syscalls() - rewrite syscall/sysenter in one ELF image
 *
 * Walks the program headers of the image described by @params. For every
 * PT_LOAD segment carrying the executable flag, the instruction stream is
 * decoded and each syscall (0f 05) / sysenter (0f 34) opcode is overwritten
 * in place with `call *%rax` (ff d0).
 *
 * The load map is consumed one entry per PT_LOAD header, in header order.
 *
 * @params: ELF meta data of the image (executable or interpreter)
 *
 * Return: number of rewritten instructions, 0 if @params has no ELF header
 * address, or -EINVAL if a PT_LOAD segment starts at or below the upper
 * bound reserved for the trampoline area.
 */
static int __zpoline_translate_syscalls(struct elf_fdpic_params *params)
{
	int count = 0, loop;
	struct insn insn;
	unsigned long addr, end, remain;
	unsigned char *code;
	struct elf_fdpic_loadseg *seg;
	struct elf_phdr *phdr;
	struct elfhdr *ehdr = (struct elfhdr *)params->elfhdr_addr;

	/* nothing was loaded for this image (e.g. no interpreter) */
	if (!ehdr)
		return 0;

	seg = params->loadmap->segs;
	phdr = params->phdrs;
	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
		if (phdr->p_type != PT_LOAD)
			continue;
		addr = seg->addr;
		end = seg->addr + seg->p_memsz;

		/*
		 * Refuse an image that overlaps the trampoline area: the
		 * bound is the trampoline start plus 0x1000 plus a 0x100
		 * margin (0x1000 is presumably the trampoline page size).
		 */
		if (addr <= (unsigned long)(&__zpoline_start[0] + 0x1000 + 0x0100)) {
			pr_warn("%lx: address is in the range of trampoline\n", addr);
			return -EINVAL;
		}

		/* translate only segment with Executable flag */
		if (!(phdr->p_flags & PF_X)) {
			seg++;
			continue;
		}

		pr_debug("translation 0x%lx-0x%lx\n", addr, end);
		/* now ready to translate */
		while (addr < end) {
			/*
			 * Hand the decoder only the bytes that are left in
			 * this segment so that it never reads, and we never
			 * rewrite, memory beyond the segment end.
			 */
			remain = end - addr;
			insn_init(&insn, (void *)addr,
				  remain < MAX_INSN_SIZE ? remain : MAX_INSN_SIZE,
				  1);
			insn_get_length(&insn);
			insn_get_opcode(&insn);

			if (insn.opcode.bytes[0] == 0x0f &&
			    (insn.opcode.bytes[1] == 0x05 ||	/* syscall */
			     insn.opcode.bytes[1] == 0x34)) {	/* sysenter */
				pr_debug("%lx: found syscall/sysenter\n", addr);
				code = (unsigned char *)addr;
				code[0] = 0xff;	/* callq */
				code[1] = 0xd0;	/* *%rax */
				count++;
			}

			addr += insn.length;
			/*
			 * A zero length means the bytes could not be decoded;
			 * step over a single byte so the scan makes progress.
			 */
			if (insn.length == 0) {
				pr_debug("%lx: length zero with byte %x. skip ?\n",
					 addr, insn.opcode.bytes[0]);
				addr += 1;
			}
		}
		seg++;
	}
	return count;
}

/**
* elf_arch_finalize_exec() - architecture hook to translate syscall/sysenter
*
* translate syscall/sysenter instruction upon loading ELF binary file
* on execve(2)&co syscall.
*
* suppose we have those instructions:
*
* mov $sysnr, %rax
* syscall 0f 05
*
* this will translate it with:
*
* mov $sysnr, %rax (<= untouched)
* call *(%rax) ff d0
*
* this will finally called hook function guided by trampoline code installed
* at setup_zpoline_trampoline().
*
* @exec_params: ELF meta data for executable file
* @interp_params: ELF meta data for the interpreter file
*/
int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params,
struct elf_fdpic_params *interp_params)
{
int err = 0, count = 0;
struct mm_struct *mm = current->mm;

/* zpoline disabled */
if (!um_zpoline_enabled)
return 0;

if (down_write_killable(&mm->mmap_lock))
return -EINTR;

/* translate for the executable */
err = __zpoline_translate_syscalls(exec_params);
if (err < 0) {
pr_info("zpoline: xlate error %d", err);
goto out;
}
count += err;
pr_debug("zpoline: rewritten (exec) %d syscalls\n", count);

/* translate for the interpreter */
err = __zpoline_translate_syscalls(interp_params);
if (err < 0) {
pr_info("zpoline: xlate error %d", err);
goto out;
}
count += err;

err = 0;
pr_debug("zpoline: rewritten (exec+interp) %d syscalls\n", count);

out:
up_write(&mm->mmap_lock);
return err;
}

/**
 * setup_zpoline_trampoline() - install trampoline code for zpoline
 *
 * Builds the syscall-hook trampoline in the page mapped at address zero
 * (hence the name zpoline). The page starts with a slide of nop
 * instructions, one byte per syscall number, so that `call *%rax` with any
 * syscall number in %rax lands in the slide and runs down to the code that
 * jumps to the hooked function, __kernel_vsyscall.
 *
 * Binaries loaded by exec(2) are translated to call into this page.
 *
 * Does nothing when zpoline is disabled. Panics if the page cannot be
 * mapped or its protection cannot be changed.
 *
 * Return: always 0.
 */
static int __init setup_zpoline_trampoline(void)
{
	/*
	 * Code following the movabs:
	 *
	 *   48 8b 0c 24   mov (%rsp),%rcx   pretend to be a syscall
	 *                                   instruction by putting the
	 *                                   return address in %rcx
	 *   41 ff e3      jmp *%r11
	 */
	const unsigned char tail[] = {
		0x48, 0x8b, 0x0c, 0x24,
		0x41, 0xff, 0xe3,
	};
	const uint64_t target = (uint64_t) __kernel_vsyscall;
	int off, k, ret;

	if (!um_zpoline_enabled)
		return 0;

	/* zpoline: the trampoline area starts at address 0x0 */
	__zpoline_start = 0x0;

	ret = os_map_memory((void *) 0, -1, 0, PAGE_SIZE, 1, 1, 1);
	if (ret)
		panic("map failed\n NOTE: /proc/sys/vm/mmap_min_addr should be set 0\n");

	/* nop slide covering every syscall number */
	for (off = 0; off < NR_syscalls; off++)
		__zpoline_start[off] = 0x90;

	/*
	 * Optimization to skip old syscalls: a short jmp (eb rel8) placed at
	 * __NR_epoll_ctl_old (214) / __NR_epoll_wait_old (215) with the
	 * largest forward displacement (a short jmp reaches -128 ~ +127).
	 */
	__zpoline_start[214] = 0xeb;
	__zpoline_start[215] = 127;

	/**
	 * FIXME: shift red zone area to properly handle the case
	 */

	/*
	 * Right after the slide, emit:
	 *
	 *   49 bb <8-byte addr>   movabs $__kernel_vsyscall,%r11
	 *
	 * with the address stored least significant byte first.
	 */
	off = NR_syscalls;
	__zpoline_start[off++] = 0x49;
	__zpoline_start[off++] = 0xbb;
	for (k = 0; k < 8; k++)
		__zpoline_start[off++] = (target >> (8 * k));

	for (k = 0; k < (int)sizeof(tail); k++)
		__zpoline_start[off++] = tail[k];

	/* permission: XOM (PROT_EXEC only) */
	ret = os_protect_memory(0, PAGE_SIZE, 0, 0, 1);
	if (ret)
		panic("failed: can't configure permission on trampoline code");

	pr_info("zpoline: setting up trampoline code done\n");
	return 0;
}
arch_initcall(setup_zpoline_trampoline);

/*
 * Handler for the "zpoline=" boot parameter.
 *
 * Stores the integer parsed from @str into um_zpoline_enabled; the value
 * stays 0 unless get_option() writes something else.
 *
 * Always returns 1.
 */
static int __init zpoline_set(char *str)
{
	int enabled = 0;

	get_option(&str, &enabled);
	um_zpoline_enabled = enabled;

	return 1;
}
__setup("zpoline=", zpoline_set);
45 changes: 31 additions & 14 deletions arch/x86/um/vdso/um_vdso.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,39 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
__kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
long __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);

/* XXX: FIXME, how to retrieve this switch from vdso object ??? */
static int um_zpoline_enabled;

/*
 * Issue the one-argument syscall @sysnr with argument @a0 and store the
 * result in @ret.
 *
 * Uses the plain `syscall` instruction unless zpoline is enabled on a !MMU
 * build, in which case `call *%rax` is issued instead.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe inside an unbraced if/else). The asm is volatile because a syscall
 * has side effects beyond its output operand.
 */
#define __VDSO_SYSCALL1(sysnr, ret, a0)					\
do {									\
	if (!um_zpoline_enabled || IS_ENABLED(CONFIG_MMU)) {		\
		asm volatile("syscall"					\
			: "=a" (ret)					\
			: "0" (sysnr), "D" (a0)				\
			: "rcx", "r11", "memory");			\
	} else {							\
		asm volatile("call *%%rax"				\
			: "=a" (ret)					\
			: "a" (sysnr), "D" (a0)				\
			: "rcx", "r11", "memory");			\
	}								\
} while (0)
/*
 * Issue the two-argument syscall @sysnr with arguments @a0 and @a1 and
 * store the result in @ret.
 *
 * Uses the plain `syscall` instruction unless zpoline is enabled on a !MMU
 * build, in which case `call *%rax` is issued instead.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe inside an unbraced if/else). The asm is volatile because a syscall
 * has side effects beyond its output operand.
 */
#define __VDSO_SYSCALL2(sysnr, ret, a0, a1)				\
do {									\
	if (!um_zpoline_enabled || IS_ENABLED(CONFIG_MMU)) {		\
		asm volatile("syscall"					\
			: "=a" (ret)					\
			: "0" (sysnr), "D" (a0), "S" (a1)		\
			: "rcx", "r11", "memory");			\
	} else {							\
		asm volatile("call *%%rax"				\
			: "=a" (ret)					\
			: "a" (sysnr), "D" (a0), "S" (a1)		\
			: "rcx", "r11", "memory");			\
	}								\
} while (0)

int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
{
long ret;

asm("syscall"
: "=a" (ret)
: "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
: "rcx", "r11", "memory");

__VDSO_SYSCALL2(__NR_clock_gettime, ret, clock, ts);
return ret;
}
int clock_gettime(clockid_t, struct __kernel_old_timespec *)
Expand All @@ -37,11 +61,7 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
long ret;

asm("syscall"
: "=a" (ret)
: "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
: "rcx", "r11", "memory");

__VDSO_SYSCALL2(__NR_gettimeofday, ret, tv, tz);
return ret;
}
int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
Expand All @@ -51,10 +71,7 @@ __kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
{
long secs;

asm volatile("syscall"
: "=a" (secs)
: "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");

__VDSO_SYSCALL1(__NR_time, secs, t);
return secs;
}
__kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));
Expand Down

0 comments on commit e483321

Please sign in to comment.