From 281d98ed755fccc2d2963d3706c3b6540fdb4625 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 28 Sep 2012 12:20:02 +0100 Subject: [PATCH] --- yaml --- r: 344824 b: refs/heads/master c: 631527703d1aa2f0c5dc2af0d998f4da95c83f0e h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/prctl/seccomp_filter.txt | 74 +------------ trunk/arch/x86/kernel/vsyscall_64.c | 110 +++++++++---------- trunk/kernel/seccomp.c | 13 +-- trunk/security/keys/process_keys.c | 2 - 5 files changed, 61 insertions(+), 140 deletions(-) diff --git a/[refs] b/[refs] index 116f2951ec0d..f7cc8fca5bcf 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 87b526d349b04c31d7b3a40b434eb3f825d22305 +refs/heads/master: 631527703d1aa2f0c5dc2af0d998f4da95c83f0e diff --git a/trunk/Documentation/prctl/seccomp_filter.txt b/trunk/Documentation/prctl/seccomp_filter.txt index 1e469ef75778..597c3c581375 100644 --- a/trunk/Documentation/prctl/seccomp_filter.txt +++ b/trunk/Documentation/prctl/seccomp_filter.txt @@ -95,15 +95,12 @@ SECCOMP_RET_KILL: SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering - task without executing the system call. siginfo->si_call_addr - will show the address of the system call instruction, and - siginfo->si_syscall and siginfo->si_arch will indicate which - syscall was attempted. The program counter will be as though - the syscall happened (i.e. it will not point to the syscall - instruction). The return value register will contain an arch- - dependent value -- if resuming execution, set it to something - sensible. (The architecture dependency is because replacing - it with -ENOSYS could overwrite some useful information.) + task without executing the system call. The kernel will + rollback the register state to just before the system call + entry such that a signal handler in the task will be able to + inspect the ucontext_t->uc_mcontext registers and emulate + system call success or failure upon return from the signal + handler. The SECCOMP_RET_DATA portion of the return value will be passed as si_errno. @@ -126,18 +123,6 @@ SECCOMP_RET_TRACE: the BPF program return value will be available to the tracer via PTRACE_GETEVENTMSG. - The tracer can skip the system call by changing the syscall number - to -1. Alternatively, the tracer can change the system call - requested by changing the system call to a valid syscall number. If - the tracer asks to skip the system call, then the system call will - appear to return the value that the tracer puts in the return value - register. - - The seccomp check will not be run again after the tracer is - notified. (This means that seccomp-based sandboxes MUST NOT - allow use of ptrace, even of other sandboxed processes, without - extreme care; ptracers can use this mechanism to escape.) - SECCOMP_RET_ALLOW: Results in the system call being executed. @@ -176,50 +161,3 @@ architecture supports both ptrace_event and seccomp, it will be able to support seccomp filter with minor fixup: SIGSYS support and seccomp return value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER to its arch-specific Kconfig. - - - -Caveats -------- - -The vDSO can cause some system calls to run entirely in userspace, -leading to surprises when you run programs on different machines that -fall back to real syscalls. To minimize these surprises on x86, make -sure you test with -/sys/devices/system/clocksource/clocksource0/current_clocksource set to -something like acpi_pm. - -On x86-64, vsyscall emulation is enabled by default. (vsyscalls are -legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities: - -- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to - the vsyscall entry for the given call and not the address after the - 'syscall' instruction. Any code which wants to restart the call - should be aware that (a) a ret instruction has been emulated and (b) - trying to resume the syscall will again trigger the standard vsyscall - emulation security checks, making resuming the syscall mostly - pointless. - -- A return value of SECCOMP_RET_TRACE will signal the tracer as usual, - but the syscall may not be changed to another system call using the - orig_rax register. It may only be changed to -1 order to skip the - currently emulated call. Any other change MAY terminate the process. - The rip value seen by the tracer will be the syscall entry address; - this is different from normal behavior. The tracer MUST NOT modify - rip or rsp. (Do not rely on other changes terminating the process. - They might work. For example, on some kernels, choosing a syscall - that only exists in future kernels will be correctly emulated (by - returning -ENOSYS). - -To detect this quirky behavior, check for addr & ~0x0C00 == -0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For -SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other -condition: future kernels may improve vsyscall emulation and current -kernels in vsyscall=native mode will behave differently, but the -instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these -cases. - -Note that modern systems are unlikely to use vsyscalls at all -- they -are a legacy feature and they are considerably slower than standard -syscalls. New code will use the vDSO, and vDSO-issued system calls -are indistinguishable from normal system calls. diff --git a/trunk/arch/x86/kernel/vsyscall_64.c b/trunk/arch/x86/kernel/vsyscall_64.c index b2e58a248b3b..8d141b309046 100644 --- a/trunk/arch/x86/kernel/vsyscall_64.c +++ b/trunk/arch/x86/kernel/vsyscall_64.c @@ -136,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ + if (!seccomp_mode(&tsk->seccomp)) + return 0; + task_pt_regs(tsk)->orig_ax = syscall_nr; + task_pt_regs(tsk)->ax = syscall_nr; + return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif + static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -168,9 +181,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; + int vsyscall_nr; int prev_sig_on_uaccess_error; long ret; + int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -202,84 +216,56 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; /* - * Check for access_ok violations and find the syscall nr. - * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ + ret = -EFAULT; + skip = 0; switch (vsyscall_nr) { case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(syscall_nr); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - if (tmp) - goto do_ret; /* skip requested */ + skip = vsyscall_seccomp(tsk, __NR_gettimeofday); + if (skip) + break; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) + break; - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: + skip = vsyscall_seccomp(tsk, __NR_time); + if (skip) + break; + + if (!write_ok_or_segv(regs->di, sizeof(time_t))) + break; + ret = sys_time((time_t __user *)regs->di); break; case 2: + skip = vsyscall_seccomp(tsk, __NR_getcpu); + if (skip) + break; + + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) + break; + ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -288,7 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; -check_fault: + if (skip) { + if ((long)regs->ax <= 0L) /* seccomp errno emulation */ + goto do_ret; + goto done; /* seccomp trace/trap */ + } + if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -311,6 +302,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; +done: return true; sigsegv: diff --git a/trunk/kernel/seccomp.c b/trunk/kernel/seccomp.c index 5af44b593770..ee376beedaf9 100644 --- a/trunk/kernel/seccomp.c +++ b/trunk/kernel/seccomp.c @@ -396,29 +396,25 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; - struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, + syscall_set_return_value(current, task_pt_regs(current), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, regs); + syscall_rollback(current, task_pt_regs(current)); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) goto skip; - } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -429,9 +425,6 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. */ - return 0; case SECCOMP_RET_ALLOW: return 0; diff --git a/trunk/security/keys/process_keys.c b/trunk/security/keys/process_keys.c index 54339cfd6734..178b8c3b130a 100644 --- a/trunk/security/keys/process_keys.c +++ b/trunk/security/keys/process_keys.c @@ -357,8 +357,6 @@ key_ref_t search_my_process_keyrings(struct key_type *type, switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ - if (ret) - break; case -ENOKEY: /* negative key */ ret = key_ref; break;