From 133e2d3e81de5d9706cab2dd1d52d231c27382e5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 12 Jun 2022 23:07:22 -0700 Subject: [PATCH 1/4] fs/exec: allow to unshare a time namespace on vfork+exec Right now, a new process can't be forked in another time namespace if it shares mm with its parent. It is prohibited, because each time namespace has its own vvar page that is mapped into a process address space. When a process calls exec, it gets a new mm and so it could be "legal" to switch time namespace in that case. This was not implemented and now if we want to do this, we need to add another clone flag to not break backward compatibility. We don't have any user requests to switch times on exec except the vfork+exec combination, so there is no reason to add a new clone flag. As for vfork+exec, this should be safe to allow switching timens with the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails if a child is forked into another time namespace. With this change, vfork creates a new process in parent's timens, and the following exec does the actual switch to the target time namespace. Suggested-by: Florian Weimer Signed-off-by: Andrei Vagin Acked-by: Christian Brauner (Microsoft) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220613060723.197407-1-avagin@gmail.com --- fs/exec.c | 7 +++++++ kernel/fork.c | 5 ++++- kernel/nsproxy.c | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a18..347e8f55bc2bb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -982,10 +983,12 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + bool vfork; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; + vfork = !!tsk->vfork_done; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) @@ -1030,6 +1033,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); + + if (vfork) + timens_on_fork(tsk->nsproxy, tsk); + if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c696..9174146f6812e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2033,8 +2033,11 @@ static __latent_entropy struct task_struct *copy_process( /* * If the new process will be in a different time namespace * do not allow it to share VM or a thread group with the forking task. + * + * On vfork, the child process enters the target time namespace only + * after exec. */ - if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) { if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index eec72ca962e24..b4cbb406bc284 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -179,7 +179,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - timens_on_fork(new_ns, tsk); + if ((flags & CLONE_VM) == 0) + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; From 6342140db6609a0c7d34f68c52b2947468e0e630 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 12 Jun 2022 23:07:23 -0700 Subject: [PATCH 2/4] selftests/timens: add a test for vfork+exit * check that a child process is in parent's time namespace after vfork. * check that a child process is in the target namespace after exec. Output on success: $ ./vfork_exec 1..1 ok 1 exec Signed-off-by: Andrei Vagin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220613060723.197407-2-avagin@gmail.com --- tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/vfork_exec.c | 90 +++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/vfork_exec.c diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 3a5936cc10abc..f0d51d4d2c879 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex vfork_exec TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread diff --git a/tools/testing/selftests/timens/vfork_exec.c b/tools/testing/selftests/timens/vfork_exec.c new file mode 100644 index 0000000000000..e6ccd900f30a4 --- /dev/null +++ b/tools/testing/selftests/timens/vfork_exec.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +#define OFFSET (36000) + +int main(int argc, char *argv[]) +{ + struct timespec now, tst; + int status, i; + pid_t pid; + + if (argc > 1) { + if (sscanf(argv[1], "%ld", &now.tv_sec) != 1) + return pr_perror("sscanf"); + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec); + } + return 0; + } + + nscheck(); + + ksft_set_plan(1); + + clock_gettime(CLOCK_MONOTONIC, &now); + + if (unshare_timens()) + return 1; + + if (_settime(CLOCK_MONOTONIC, OFFSET)) + return 1; + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec, tst.tv_sec); + } + + pid = vfork(); + if (pid < 0) + return pr_perror("fork"); + + if (pid == 0) { + char now_str[64]; + char *cargv[] = {"exec", now_str, NULL}; + char *cenv[] = {NULL}; + + // Check that we are still in the source timens. + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec, tst.tv_sec); + } + + /* Check for proper vvar offsets after execve. */ + snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET); + execve("/proc/self/exe", cargv, cenv); + return pr_perror("execve"); + } + + if (waitpid(pid, &status, 0) != pid) + return pr_perror("waitpid"); + + if (status) + ksft_exit_fail(); + + ksft_test_result_pass("exec\n"); + ksft_exit_pass(); + return 0; +} From 5036793d7dbd0b14aec51526441a50b01c7bf66d Mon Sep 17 00:00:00 2001 From: Zhang Jiaming Date: Wed, 29 Jun 2022 15:29:32 +0800 Subject: [PATCH 3/4] exec: Fix a spelling mistake Change 'wont't' to 'won't'. Signed-off-by: Zhang Jiaming Reviewed-by: Souptick Joarder (HPE) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220629072932.27506-1-jiaming@nfschina.com --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 347e8f55bc2bb..0fc56c70c870f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1156,7 +1156,7 @@ static int de_thread(struct task_struct *tsk) /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees - * the tracer wont't block again waiting for this thread. + * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); From c6e8e36c6ae4b11bed5643317afb66b6c3cadba8 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 24 Jul 2022 23:25:23 +0200 Subject: [PATCH 4/4] exec: Call kmap_local_page() in copy_string_kernel() The use of kmap_atomic() is being deprecated in favor of kmap_local_page(). With kmap_local_page(), the mappings are per thread, CPU local and not globally visible. Furthermore, the mappings can be acquired from any context (including interrupts). Therefore, replace kmap_atomic() with kmap_local_page() in copy_string_kernel(). Instead of open-coding local mapping + memcpy(), use memcpy_to_page(). Delete a redundant call to flush_dcache_page(). Tested with xfstests on a QEMU/ KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Suggested-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220724212523.13317-1-fmdefrancesco@gmail.com --- fs/exec.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 0fc56c70c870f..7e842d1bcb674 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -631,7 +631,6 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; - char *kaddr; pos -= bytes_to_copy; arg -= bytes_to_copy; @@ -640,11 +639,8 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; - kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); - memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); }