From 281d98ed755fccc2d2963d3706c3b6540fdb4625 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Fri, 28 Sep 2012 12:20:02 +0100
Subject: [PATCH] --- yaml --- r: 344824 b: refs/heads/master c:
 631527703d1aa2f0c5dc2af0d998f4da95c83f0e h: refs/heads/master v: v3

---
 [refs]                                       |   2 +-
 trunk/Documentation/prctl/seccomp_filter.txt |  74 +------------
 trunk/arch/x86/kernel/vsyscall_64.c          | 110 +++++++++----------
 trunk/kernel/seccomp.c                       |  13 +--
 trunk/security/keys/process_keys.c           |   2 -
 5 files changed, 61 insertions(+), 140 deletions(-)

diff --git a/[refs] b/[refs]
index 116f2951ec0d..f7cc8fca5bcf 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 87b526d349b04c31d7b3a40b434eb3f825d22305
+refs/heads/master: 631527703d1aa2f0c5dc2af0d998f4da95c83f0e
diff --git a/trunk/Documentation/prctl/seccomp_filter.txt b/trunk/Documentation/prctl/seccomp_filter.txt
index 1e469ef75778..597c3c581375 100644
--- a/trunk/Documentation/prctl/seccomp_filter.txt
+++ b/trunk/Documentation/prctl/seccomp_filter.txt
@@ -95,15 +95,12 @@ SECCOMP_RET_KILL:
 
 SECCOMP_RET_TRAP:
 	Results in the kernel sending a SIGSYS signal to the triggering
-	task without executing the system call.  siginfo->si_call_addr
-	will show the address of the system call instruction, and
-	siginfo->si_syscall and siginfo->si_arch will indicate which
-	syscall was attempted.  The program counter will be as though
-	the syscall happened (i.e. it will not point to the syscall
-	instruction).  The return value register will contain an arch-
-	dependent value -- if resuming execution, set it to something
-	sensible.  (The architecture dependency is because replacing
-	it with -ENOSYS could overwrite some useful information.)
+	task without executing the system call.  The kernel will
+	roll back the register state to just before the system call
+	entry such that a signal handler in the task will be able to
+	inspect the ucontext_t->uc_mcontext registers and emulate
+	system call success or failure upon return from the signal
+	handler.
 
 	The SECCOMP_RET_DATA portion of the return value will be passed
 	as si_errno.
@@ -126,18 +123,6 @@ SECCOMP_RET_TRACE:
 	the BPF program return value will be available to the tracer
 	via PTRACE_GETEVENTMSG.
 
-	The tracer can skip the system call by changing the syscall number
-	to -1.  Alternatively, the tracer can change the system call
-	requested by changing the system call to a valid syscall number.  If
-	the tracer asks to skip the system call, then the system call will
-	appear to return the value that the tracer puts in the return value
-	register.
-
-	The seccomp check will not be run again after the tracer is
-	notified.  (This means that seccomp-based sandboxes MUST NOT
-	allow use of ptrace, even of other sandboxed processes, without
-	extreme care; ptracers can use this mechanism to escape.)
-
 SECCOMP_RET_ALLOW:
 	Results in the system call being executed.
 
@@ -176,50 +161,3 @@ architecture supports both ptrace_event and seccomp, it will be able to
 support seccomp filter with minor fixup: SIGSYS support and seccomp return
 value checking.  Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
 to its arch-specific Kconfig.
-
-
-
-Caveats
--------
-
-The vDSO can cause some system calls to run entirely in userspace,
-leading to surprises when you run programs on different machines that
-fall back to real syscalls.  To minimize these surprises on x86, make
-sure you test with
-/sys/devices/system/clocksource/clocksource0/current_clocksource set to
-something like acpi_pm.
-
-On x86-64, vsyscall emulation is enabled by default.  (vsyscalls are
-legacy variants on vDSO calls.)  Currently, emulated vsyscalls will honor seccomp, with a few oddities:
-
-- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
-  the vsyscall entry for the given call and not the address after the
-  'syscall' instruction.  Any code which wants to restart the call
-  should be aware that (a) a ret instruction has been emulated and (b)
-  trying to resume the syscall will again trigger the standard vsyscall
-  emulation security checks, making resuming the syscall mostly
-  pointless.
-
-- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
-  but the syscall may not be changed to another system call using the
-  orig_rax register. It may only be changed to -1 order to skip the
-  currently emulated call. Any other change MAY terminate the process.
-  The rip value seen by the tracer will be the syscall entry address;
-  this is different from normal behavior.  The tracer MUST NOT modify
-  rip or rsp.  (Do not rely on other changes terminating the process.
-  They might work.  For example, on some kernels, choosing a syscall
-  that only exists in future kernels will be correctly emulated (by
-  returning -ENOSYS).
-
-To detect this quirky behavior, check for addr & ~0x0C00 ==
-0xFFFFFFFFFF600000.  (For SECCOMP_RET_TRACE, use rip.  For
-SECCOMP_RET_TRAP, use siginfo->si_call_addr.)  Do not check any other
-condition: future kernels may improve vsyscall emulation and current
-kernels in vsyscall=native mode will behave differently, but the
-instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
-cases.
-
-Note that modern systems are unlikely to use vsyscalls at all -- they
-are a legacy feature and they are considerably slower than standard
-syscalls.  New code will use the vDSO, and vDSO-issued system calls
-are indistinguishable from normal system calls.
diff --git a/trunk/arch/x86/kernel/vsyscall_64.c b/trunk/arch/x86/kernel/vsyscall_64.c
index b2e58a248b3b..8d141b309046 100644
--- a/trunk/arch/x86/kernel/vsyscall_64.c
+++ b/trunk/arch/x86/kernel/vsyscall_64.c
@@ -136,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
+#ifdef CONFIG_SECCOMP	/* seccomp check for one emulated vsyscall */
+static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
+{
+	if (!seccomp_mode(&tsk->seccomp))	/* no mode set: nothing to check */
+		return 0;
+	task_pt_regs(tsk)->orig_ax = syscall_nr;	/* record nr in tsk's regs */
+	task_pt_regs(tsk)->ax = syscall_nr;	/* NOTE(review): ax also gets nr */
+	return __secure_computing(syscall_nr);	/* nonzero: caller skips call */
+}
+#else /* !CONFIG_SECCOMP */
+#define vsyscall_seccomp(_tsk, _nr) 0	/* stub: always 0, never skip */
+#endif /* CONFIG_SECCOMP */
+
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
 	/*
@@ -168,9 +181,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
-	int vsyscall_nr, syscall_nr, tmp;
+	int vsyscall_nr;
 	int prev_sig_on_uaccess_error;
 	long ret;
+	int skip;
 
 	/*
 	 * No point in checking CS -- the only way to get here is a user mode
@@ -202,84 +216,56 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	}
 
 	tsk = current;
+	/*
+	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
+	 * preserve that behavior to make writing exploits harder.
+	 */
+	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+	current_thread_info()->sig_on_uaccess_error = 1;
 
 	/*
-	 * Check for access_ok violations and find the syscall nr.
-	 *
 	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
 	 * 64-bit, so we don't need to special-case it here.  For all the
 	 * vsyscalls, NULL means "don't write anything" not "write it at
 	 * address 0".
 	 */
+	ret = -EFAULT;
+	skip = 0;
 	switch (vsyscall_nr) {
 	case 0:
-		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
-		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
-			ret = -EFAULT;
-			goto check_fault;
-		}
-
-		syscall_nr = __NR_gettimeofday;
-		break;
-
-	case 1:
-		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
-			ret = -EFAULT;
-			goto check_fault;
-		}
-
-		syscall_nr = __NR_time;
-		break;
-
-	case 2:
-		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
-		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
-			ret = -EFAULT;
-			goto check_fault;
-		}
-
-		syscall_nr = __NR_getcpu;
-		break;
-	}
-
-	/*
-	 * Handle seccomp.  regs->ip must be the original value.
-	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
-	 *
-	 * We could optimize the seccomp disabled case, but performance
-	 * here doesn't matter.
-	 */
-	regs->orig_ax = syscall_nr;
-	regs->ax = -ENOSYS;
-	tmp = secure_computing(syscall_nr);
-	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
-		warn_bad_vsyscall(KERN_DEBUG, regs,
-				  "seccomp tried to change syscall nr or ip");
-		do_exit(SIGSYS);
-	}
-	if (tmp)
-		goto do_ret;  /* skip requested */
+		skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
+		if (skip)
+			break;
 
-	/*
-	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
-	 * preserve that behavior to make writing exploits harder.
-	 */
-	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
-	current_thread_info()->sig_on_uaccess_error = 1;
+		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+			break;
 
-	ret = -EFAULT;
-	switch (vsyscall_nr) {
-	case 0:
 		ret = sys_gettimeofday(
 			(struct timeval __user *)regs->di,
 			(struct timezone __user *)regs->si);
 		break;
 
 	case 1:
+		skip = vsyscall_seccomp(tsk, __NR_time);
+		if (skip)
+			break;
+
+		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+			break;
+
 		ret = sys_time((time_t __user *)regs->di);
 		break;
 
 	case 2:
+		skip = vsyscall_seccomp(tsk, __NR_getcpu);
+		if (skip)
+			break;
+
+		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
+			break;
+
 		ret = sys_getcpu((unsigned __user *)regs->di,
 				 (unsigned __user *)regs->si,
 				 NULL);
@@ -288,7 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
-check_fault:
+	if (skip) {
+		if ((long)regs->ax <= 0L) /* seccomp errno emulation */
+			goto do_ret;
+		goto done; /* seccomp trace/trap */
+	}
+
 	if (ret == -EFAULT) {
 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
@@ -311,6 +302,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	/* Emulate a ret instruction. */
 	regs->ip = caller;
 	regs->sp += 8;
+done:
 	return true;
 
 sigsegv:
diff --git a/trunk/kernel/seccomp.c b/trunk/kernel/seccomp.c
index 5af44b593770..ee376beedaf9 100644
--- a/trunk/kernel/seccomp.c
+++ b/trunk/kernel/seccomp.c
@@ -396,29 +396,25 @@ int __secure_computing(int this_syscall)
 #ifdef CONFIG_SECCOMP_FILTER
 	case SECCOMP_MODE_FILTER: {
 		int data;
-		struct pt_regs *regs = task_pt_regs(current);
 		ret = seccomp_run_filters(this_syscall);
 		data = ret & SECCOMP_RET_DATA;
 		ret &= SECCOMP_RET_ACTION;
 		switch (ret) {
 		case SECCOMP_RET_ERRNO:
 			/* Set the low-order 16-bits as a errno. */
-			syscall_set_return_value(current, regs,
+			syscall_set_return_value(current, task_pt_regs(current),
 						 -data, 0);
 			goto skip;
 		case SECCOMP_RET_TRAP:
 			/* Show the handler the original registers. */
-			syscall_rollback(current, regs);
+			syscall_rollback(current, task_pt_regs(current));
 			/* Let the filter pass back 16 bits of data. */
 			seccomp_send_sigsys(this_syscall, data);
 			goto skip;
 		case SECCOMP_RET_TRACE:
 			/* Skip these calls if there is no tracer. */
-			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-				syscall_set_return_value(current, regs,
-							 -ENOSYS, 0);
+			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
 				goto skip;
-			}
 			/* Allow the BPF to provide the event message */
 			ptrace_event(PTRACE_EVENT_SECCOMP, data);
 			/*
@@ -429,9 +425,6 @@ int __secure_computing(int this_syscall)
 			 */
 			if (fatal_signal_pending(current))
 				break;
-			if (syscall_get_nr(current, regs) < 0)
-				goto skip;  /* Explicit request to skip. */
-
 			return 0;
 		case SECCOMP_RET_ALLOW:
 			return 0;
diff --git a/trunk/security/keys/process_keys.c b/trunk/security/keys/process_keys.c
index 54339cfd6734..178b8c3b130a 100644
--- a/trunk/security/keys/process_keys.c
+++ b/trunk/security/keys/process_keys.c
@@ -357,8 +357,6 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
 
 		switch (PTR_ERR(key_ref)) {
 		case -EAGAIN: /* no key */
-			if (ret)
-				break;
 		case -ENOKEY: /* negative key */
 			ret = key_ref;
 			break;