From 8c1b21198551d795a44f08ad185f716732b47bbe Mon Sep 17 00:00:00 2001 From: Milan Landaverde Date: Tue, 22 Mar 2022 10:49:45 -0400 Subject: [PATCH 01/16] bpf/bpftool: Add unprivileged_bpf_disabled check against value of 2 In [1], we added a kconfig knob that can set /proc/sys/kernel/unprivileged_bpf_disabled to 2 We now check against this value in bpftool feature probe [1] https://lore.kernel.org/bpf/74ec548079189e4e4dffaeb42b8987bb3c852eee.1620765074.git.daniel@iogearbox.net Signed-off-by: Milan Landaverde Signed-off-by: Alexei Starovoitov Acked-by: Quentin Monnet Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20220322145012.1315376-1-milan@mdaverde.com --- tools/bpf/bpftool/feature.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index c2f43a5d38e01..290998c82de12 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -207,7 +207,10 @@ static void probe_unprivileged_disabled(void) printf("bpf() syscall for unprivileged users is enabled\n"); break; case 1: - printf("bpf() syscall restricted to privileged users\n"); + printf("bpf() syscall restricted to privileged users (without recovery)\n"); + break; + case 2: + printf("bpf() syscall restricted to privileged users (admin can change)\n"); break; case -1: printf("Unable to retrieve required privileges for bpf() syscall\n"); From 9052e4e83762c40d89f6650fec161ad9001e93f3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 23 Mar 2022 16:35:26 +0900 Subject: [PATCH 02/16] fprobe: Fix smatch type mismatch warning Fix the type mismatching warning of 'rethook_node vs fprobe_rethook_node' found by Smatch. Reported-by: Dan Carpenter Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164802092611.1732982.12268174743437084619.stgit@devnote2 --- kernel/trace/fprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 8b2dd5b9dcd14..63b2321b22a08 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -150,15 +150,15 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); for (i = 0; i < size; i++) { - struct rethook_node *node; + struct fprobe_rethook_node *node; - node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL); + node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) { rethook_free(fp->rethook); fp->rethook = NULL; return -ENOMEM; } - rethook_add_node(fp->rethook, node); + rethook_add_node(fp->rethook, &node->node); } return 0; } From 261608f3105ce65e9fd01919f72ead74dcb9f14d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 23 Mar 2022 16:35:36 +0900 Subject: [PATCH 03/16] fprobe: Fix sparse warning for acccessing __rcu ftrace_hash Since ftrace_ops::local_hash::filter_hash field is an __rcu pointer, we have to use rcu_access_pointer() to access it. Reported-by: kernel test robot Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164802093635.1732982.4938094876018890866.stgit@devnote2 --- kernel/trace/fprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 63b2321b22a08..89d9f994ebb02 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -215,7 +215,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter * correctly calculate the total number of filtered symbols * from both filter and notfilter. */ - hash = fp->ops.local_hash.filter_hash; + hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); if (WARN_ON_ONCE(!hash)) goto out; From 98870605b3742c38353b50b6dbad33fab9de415f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 24 Mar 2022 16:37:32 +0800 Subject: [PATCH 04/16] bpf: Sync comments for bpf_get_stack Commit ee2a098851bf missed updating the comments for helper bpf_get_stack in tools/include/uapi/linux/bpf.h. Sync it. Fixes: ee2a098851bf ("bpf: Adjust BPF stack helper functions to accommodate skip > 0") Signed-off-by: Geliang Tang Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/ce54617746b7ed5e9ba3b844e55e74cb8a60e0b5.1648110794.git.geliang.tang@suse.com --- tools/include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7604e7d5438f8..d14b10b85e51f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3009,8 +3009,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -4316,8 +4316,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description From c29a4920dfcaa1433b09e2674f605f72767a385c Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 25 Mar 2022 00:42:38 +0800 Subject: [PATCH 05/16] bpf: Fix maximum permitted number of arguments check Since the m->arg_size array can hold up to MAX_BPF_FUNC_ARGS argument sizes, it's ok that nargs is equal to MAX_BPF_FUNC_ARGS. Signed-off-by: Yuntao Wang Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220324164238.1274915-1-ytcoode@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 24788ce564a08..0918a39279f6c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5507,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); - if (nargs >= MAX_BPF_FUNC_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. Too many.\n", tname, nargs); From 99dea2c664d7bc7e4f6f6947182d0d365165a998 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 25 Mar 2022 15:56:43 -0700 Subject: [PATCH 06/16] selftests/bpf: fix selftest after random: Urandom_read tracepoint removal 14c174633f34 ("random: remove unused tracepoints") removed all the tracepoints from drivers/char/random.c, one of which, random:urandom_read, was used by stacktrace_build_id selftest to trigger stack trace capture. Fix breakage by switching to kprobing urandom_read() function. Suggested-by: Yonghong Song Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220325225643.2606-1-andrii@kernel.org --- .../selftests/bpf/progs/test_stacktrace_build_id.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c index 36a707e7c7a7a..6c62bfb8bb6fa 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c @@ -39,16 +39,8 @@ struct { __type(value, stack_trace_t); } stack_amap SEC(".maps"); -/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ -struct random_urandom_args { - unsigned long long pad; - int got_bits; - int pool_left; - int input_left; -}; - -SEC("tracepoint/random/urandom_read") -int oncpu(struct random_urandom_args *args) +SEC("kprobe/urandom_read") +int oncpu(struct pt_regs *args) { __u32 max_len = sizeof(struct bpf_stack_build_id) * PERF_MAX_STACK_DEPTH; From ef8a257b4e499a979364b1f9caf25a325f6ee8b8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 28 Mar 2022 10:37:03 +0200 Subject: [PATCH 07/16] bpftool: Fix generated code in codegen_asserts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arnaldo reported perf compilation fail with: $ make -k BUILD_BPF_SKEL=1 CORESIGHT=1 PYTHON=python3 ... In file included from util/bpf_counter.c:28: /tmp/build/perf//util/bpf_skel/bperf_leader.skel.h: In function ‘bperf_leader_bpf__assert’: /tmp/build/perf//util/bpf_skel/bperf_leader.skel.h:351:51: error: unused parameter ‘s’ [-Werror=unused-parameter] 351 | bperf_leader_bpf__assert(struct bperf_leader_bpf *s) | ~~~~~~~~~~~~~~~~~~~~~~~~~^ cc1: all warnings being treated as errors If there's nothing to generate in the new assert function, we will get unused 's' warn/error, adding 'unused' attribute to it. Fixes: 08d4dba6ae77 ("bpftool: Bpf skeletons assert type sizes") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/bpf/20220328083703.2880079-1-jolsa@kernel.org --- tools/bpf/bpftool/gen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7ba7ff55d2eaa..91af2850b5057 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -477,7 +477,7 @@ static void codegen_asserts(struct bpf_object *obj, const char *obj_name) codegen("\ \n\ __attribute__((unused)) static void \n\ - %1$s__assert(struct %1$s *s) \n\ + %1$s__assert(struct %1$s *s __attribute__((unused))) \n\ { \n\ #ifdef __cplusplus \n\ #define _Static_assert static_assert \n\ From 73f9b911faa74ac5107879de05c9489c419f41bb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 26 Mar 2022 11:27:05 +0900 Subject: [PATCH 08/16] kprobes: Use rethook for kretprobe if possible Use rethook for kretprobe function return hooking if the arch sets CONFIG_HAVE_RETHOOK=y. In this case, CONFIG_KRETPROBE_ON_RETHOOK is set to 'y' automatically, and the kretprobe internal data fields switches to use rethook. If not, it continues to use kretprobe specific function return hooks. Suggested-by: Peter Zijlstra Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164826162556.2455864.12255833167233452047.stgit@devnote2 --- arch/Kconfig | 8 ++- include/linux/kprobes.h | 51 ++++++++++++++- kernel/Makefile | 1 + kernel/kprobes.c | 124 ++++++++++++++++++++++++++++++------ kernel/trace/trace_kprobe.c | 4 +- 5 files changed, 163 insertions(+), 25 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 84bc1de02720b..33e06966f2487 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -164,7 +164,13 @@ config ARCH_USE_BUILTIN_BSWAP config KRETPROBES def_bool y - depends on KPROBES && HAVE_KRETPROBES + depends on KPROBES && (HAVE_KRETPROBES || HAVE_RETHOOK) + +config KRETPROBE_ON_RETHOOK + def_bool y + depends on HAVE_RETHOOK + depends on KRETPROBES + select RETHOOK config USER_RETURN_NOTIFIER bool diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 312ff997c7436..157168769fc26 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -149,13 +150,20 @@ struct kretprobe { int maxactive; int nmissed; size_t data_size; +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct rethook *rh; +#else struct freelist_head freelist; struct kretprobe_holder *rph; +#endif }; #define KRETPROBE_MAX_DATA_SIZE 4096 struct kretprobe_instance { +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct rethook_node node; +#else union { struct freelist_node freelist; struct rcu_head rcu; @@ -164,6 +172,7 @@ struct kretprobe_instance { struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; void *fp; +#endif char data[]; }; @@ -186,10 +195,24 @@ extern void kprobe_busy_begin(void); extern void kprobe_busy_end(void); #ifdef CONFIG_KRETPROBES -extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs); +/* Check whether @p is used for implementing a trampoline. */ extern int arch_trampoline_kprobe(struct kprobe *p); +#ifdef CONFIG_KRETPROBE_ON_RETHOOK +static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), + "Kretprobe is accessed from instance under preemptive context"); + + return (struct kretprobe *)READ_ONCE(ri->node.rethook->data); +} +static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) +{ + return ri->node.ret_addr; +} +#else +extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs); void arch_kretprobe_fixup_return(struct pt_regs *regs, kprobe_opcode_t *correct_ret_addr); @@ -232,6 +255,12 @@ static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance return READ_ONCE(ri->rph->rp); } +static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) +{ + return (unsigned long)ri->ret_addr; +} +#endif /* CONFIG_KRETPROBE_ON_RETHOOK */ + #else /* !CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) @@ -395,7 +424,11 @@ void unregister_kretprobe(struct kretprobe *rp); int register_kretprobes(struct kretprobe **rps, int num); void unregister_kretprobes(struct kretprobe **rps, int num); +#ifdef CONFIG_KRETPROBE_ON_RETHOOK +#define kprobe_flush_task(tk) do {} while (0) +#else void kprobe_flush_task(struct task_struct *tk); +#endif void kprobe_free_init_mem(void); @@ -509,6 +542,19 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) #endif /* !CONFIG_OPTPROBES */ #ifdef CONFIG_KRETPROBES +#ifdef CONFIG_KRETPROBE_ON_RETHOOK +static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) +{ + return is_rethook_trampoline(addr); +} + +static nokprobe_inline +unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, + struct llist_node **cur) +{ + return rethook_find_ret_addr(tsk, (unsigned long)fp, cur); +} +#else static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) { return (void *)addr == kretprobe_trampoline_addr(); @@ -516,6 +562,7 @@ static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, struct llist_node **cur); +#endif #else static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) { diff --git a/kernel/Makefile b/kernel/Makefile index 56f4ee97f3284..471d71935e90a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,6 +108,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ +obj-$(CONFIG_RETHOOK) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 185badc780b7c..dbe57df2e199e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1237,6 +1237,27 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); +static struct kprobe kprobe_busy = { + .addr = (void *) get_kprobe, +}; + +void kprobe_busy_begin(void) +{ + struct kprobe_ctlblk *kcb; + + preempt_disable(); + __this_cpu_write(current_kprobe, &kprobe_busy); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; +} + +void kprobe_busy_end(void) +{ + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); +} + +#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) static void free_rp_inst_rcu(struct rcu_head *head) { struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); @@ -1258,26 +1279,6 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) } NOKPROBE_SYMBOL(recycle_rp_inst); -static struct kprobe kprobe_busy = { - .addr = (void *) get_kprobe, -}; - -void kprobe_busy_begin(void) -{ - struct kprobe_ctlblk *kcb; - - preempt_disable(); - __this_cpu_write(current_kprobe, &kprobe_busy); - kcb = get_kprobe_ctlblk(); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; -} - -void kprobe_busy_end(void) -{ - __this_cpu_write(current_kprobe, NULL); - preempt_enable(); -} - /* * This function is called from delayed_put_task_struct() when a task is * dead and cleaned up to recycle any kretprobe instances associated with @@ -1327,6 +1328,7 @@ static inline void free_rp_inst(struct kretprobe *rp) rp->rph = NULL; } } +#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ /* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) @@ -1925,6 +1927,7 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES +#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) /* This assumes the 'tsk' is the current task or the is not running. */ static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, struct llist_node **cur) @@ -2087,6 +2090,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); +#else /* CONFIG_KRETPROBE_ON_RETHOOK */ +/* + * This kprobe pre_handler is registered with every kretprobe. When probe + * hits it will set up the return probe. + */ +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + struct kretprobe_instance *ri; + struct rethook_node *rhn; + + rhn = rethook_try_get(rp->rh); + if (!rhn) { + rp->nmissed++; + return 0; + } + + ri = container_of(rhn, struct kretprobe_instance, node); + + if (rp->entry_handler && rp->entry_handler(ri, regs)) + rethook_recycle(rhn); + else + rethook_hook(rhn, regs, kprobe_ftrace(p)); + + return 0; +} +NOKPROBE_SYMBOL(pre_handler_kretprobe); + +static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) +{ + struct kretprobe *rp = (struct kretprobe *)data; + struct kretprobe_instance *ri; + struct kprobe_ctlblk *kcb; + + /* The data must NOT be null. This means rethook data structure is broken. */ + if (WARN_ON_ONCE(!data)) + return; + + __this_cpu_write(current_kprobe, &rp->kp); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + ri = container_of(rh, struct kretprobe_instance, node); + rp->handler(ri, regs); + + __this_cpu_write(current_kprobe, NULL); +} +NOKPROBE_SYMBOL(kretprobe_rethook_handler); + +#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ /** * kprobe_on_func_entry() -- check whether given address is function entry @@ -2155,6 +2209,29 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); + if (!rp->rh) + return -ENOMEM; + + for (i = 0; i < rp->maxactive; i++) { + inst = kzalloc(sizeof(struct kretprobe_instance) + + rp->data_size, GFP_KERNEL); + if (inst == NULL) { + rethook_free(rp->rh); + rp->rh = NULL; + return -ENOMEM; + } + rethook_add_node(rp->rh, &inst->node); + } + rp->nmissed = 0; + /* Establish function entry probe point */ + ret = register_kprobe(&rp->kp); + if (ret != 0) { + rethook_free(rp->rh); + rp->rh = NULL; + } +#else /* !CONFIG_KRETPROBE_ON_RETHOOK */ rp->freelist.head = NULL; rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); if (!rp->rph) @@ -2179,6 +2256,7 @@ int register_kretprobe(struct kretprobe *rp) ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); +#endif return ret; } EXPORT_SYMBOL_GPL(register_kretprobe); @@ -2217,7 +2295,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num) for (i = 0; i < num; i++) { if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + rethook_free(rps[i]->rh); +#else rps[i]->rph->rp = NULL; +#endif } mutex_unlock(&kprobe_mutex); @@ -2225,7 +2307,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num) for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); +#ifndef CONFIG_KRETPROBE_ON_RETHOOK free_rp_inst(rps[i]); +#endif } } } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b62fd785b599b..47cebef78532c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1433,7 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, fbuffer.regs = regs; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; + entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); @@ -1628,7 +1628,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; entry->func = (unsigned long)tk->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; + entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); From f3a112c0c40dd96d53c8bdf3ea8d94d528f3b7b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 26 Mar 2022 11:27:17 +0900 Subject: [PATCH 09/16] x86,rethook,kprobes: Replace kretprobe with rethook on x86 Replaces the kretprobe code with rethook on x86. With this patch, kretprobe on x86 uses the rethook instead of kretprobe specific trampoline code. Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Tested-by: Jiri Olsa Link: https://lore.kernel.org/bpf/164826163692.2455864.13745421016848209527.stgit@devnote2 --- arch/x86/Kconfig | 1 + arch/x86/include/asm/unwind.h | 23 +++--- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kprobes/common.h | 1 + arch/x86/kernel/kprobes/core.c | 107 -------------------------- arch/x86/kernel/rethook.c | 125 +++++++++++++++++++++++++++++++ arch/x86/kernel/unwind_orc.c | 10 +-- 7 files changed, 144 insertions(+), 124 deletions(-) create mode 100644 arch/x86/kernel/rethook.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7340d9f01b621..ff45a27fc29ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -224,6 +224,7 @@ config X86 select HAVE_KPROBES_ON_FTRACE select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES + select HAVE_RETHOOK select HAVE_KVM select HAVE_LIVEPATCH if X86_64 select HAVE_MIXED_BREAKPOINTS_REGS diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 2a1f8734416dc..7cede4dc21f00 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include @@ -16,7 +16,7 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; -#ifdef CONFIG_KRETPROBES +#if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; @@ -104,19 +104,18 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, #endif static inline -unsigned long unwind_recover_kretprobe(struct unwind_state *state, - unsigned long addr, unsigned long *addr_p) +unsigned long unwind_recover_rethook(struct unwind_state *state, + unsigned long addr, unsigned long *addr_p) { -#ifdef CONFIG_KRETPROBES - return is_kretprobe_trampoline(addr) ? - kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) : - addr; -#else - return addr; +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(addr)) + return rethook_find_ret_addr(state->task, (unsigned long)addr_p, + &state->kr_cur); #endif + return addr; } -/* Recover the return address modified by kretprobe and ftrace_graph. */ +/* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) @@ -125,7 +124,7 @@ unsigned long unwind_recover_ret_addr(struct unwind_state *state, ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); - return unwind_recover_kretprobe(state, ret, addr_p); + return unwind_recover_rethook(state, ret, addr_p); } /* diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6462e3dd98f49..c41ef42adbe8a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 7d3a2e2daf011..c993521d49332 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -6,6 +6,7 @@ #include #include +#include #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 8ef933c03afac..7c4ab8870da44 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -811,18 +811,6 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, = (regs->flags & X86_EFLAGS_IF); } -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long *sara = stack_addr(regs); - - ri->ret_addr = (kprobe_opcode_t *) *sara; - ri->fp = sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { @@ -1023,101 +1011,6 @@ int kprobe_int3_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_int3_handler); -/* - * When a retprobed function returns, this code saves registers and - * calls trampoline_handler() runs, which calls the kretprobe's handler. - */ -asm( - ".text\n" - ".global __kretprobe_trampoline\n" - ".type __kretprobe_trampoline, @function\n" - "__kretprobe_trampoline:\n" -#ifdef CONFIG_X86_64 - ANNOTATE_NOENDBR - /* Push a fake return address to tell the unwinder it's a kretprobe. */ - " pushq $__kretprobe_trampoline\n" - UNWIND_HINT_FUNC - /* Save the 'sp - 8', this will be fixed later. */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - RESTORE_REGS_STRING - /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ - " addq $8, %rsp\n" - " popfq\n" -#else - /* Push a fake return address to tell the unwinder it's a kretprobe. */ - " pushl $__kretprobe_trampoline\n" - UNWIND_HINT_FUNC - /* Save the 'sp - 4', this will be fixed later. */ - " pushl %esp\n" - " pushfl\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - RESTORE_REGS_STRING - /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ - " addl $4, %esp\n" - " popfl\n" -#endif - ASM_RET - ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" -); -NOKPROBE_SYMBOL(__kretprobe_trampoline); -/* - * __kretprobe_trampoline() skips updating frame pointer. The frame pointer - * saved in trampoline_handler() points to the real caller function's - * frame pointer. Thus the __kretprobe_trampoline() doesn't have a - * standard stack frame with CONFIG_FRAME_POINTER=y. - * Let's mark it non-standard function. Anyway, FP unwinder can correctly - * unwind without the hint. - */ -STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); - -/* This is called from kretprobe_trampoline_handler(). */ -void arch_kretprobe_fixup_return(struct pt_regs *regs, - kprobe_opcode_t *correct_ret_addr) -{ - unsigned long *frame_pointer = ®s->sp + 1; - - /* Replace fake return address with real one. */ - *frame_pointer = (unsigned long)correct_ret_addr; -} - -/* - * Called from __kretprobe_trampoline - */ -__used __visible void trampoline_handler(struct pt_regs *regs) -{ - unsigned long *frame_pointer; - - /* fixup registers */ - regs->cs = __KERNEL_CS; -#ifdef CONFIG_X86_32 - regs->gs = 0; -#endif - regs->ip = (unsigned long)&__kretprobe_trampoline; - regs->orig_ax = ~0UL; - regs->sp += sizeof(long); - frame_pointer = ®s->sp + 1; - - /* - * The return address at 'frame_pointer' is recovered by the - * arch_kretprobe_fixup_return() which called from the - * kretprobe_trampoline_handler(). - */ - kretprobe_trampoline_handler(regs, frame_pointer); - - /* - * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline() - * can do RET right after POPF. - */ - regs->sp = regs->flags; -} -NOKPROBE_SYMBOL(trampoline_handler); - int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c new file mode 100644 index 0000000000000..af7e6713a07a7 --- /dev/null +++ b/arch/x86/kernel/rethook.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c. + */ +#include +#include +#include +#include + +#include "kprobes/common.h" + +__visible void arch_rethook_trampoline_callback(struct pt_regs *regs); + +#ifndef ANNOTATE_NOENDBR +#define ANNOTATE_NOENDBR +#endif + +/* + * When a target function returns, this code saves registers and calls + * arch_rethook_trampoline_callback(), which calls the rethook handler. + */ +asm( + ".text\n" + ".global arch_rethook_trampoline\n" + ".type arch_rethook_trampoline, @function\n" + "arch_rethook_trampoline:\n" +#ifdef CONFIG_X86_64 + ANNOTATE_NOENDBR /* This is only jumped from ret instruction */ + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushq $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + /* Save the 'sp - 8', this will be fixed later. */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ + " addq $8, %rsp\n" + " popfq\n" +#else + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushl $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + /* Save the 'sp - 4', this will be fixed later. */ + " pushl %esp\n" + " pushfl\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ + " addl $4, %esp\n" + " popfl\n" +#endif + ASM_RET + ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n" +); +NOKPROBE_SYMBOL(arch_rethook_trampoline); + +/* + * Called from arch_rethook_trampoline + */ +__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + unsigned long *frame_pointer; + + /* fixup registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)&arch_rethook_trampoline; + regs->orig_ax = ~0UL; + regs->sp += sizeof(long); + frame_pointer = ®s->sp + 1; + + /* + * The return address at 'frame_pointer' is recovered by the + * arch_rethook_fixup_return() which called from this + * rethook_trampoline_handler(). + */ + rethook_trampoline_handler(regs, (unsigned long)frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::sp' so that arch_rethook_trapmoline() + * can do RET right after POPF. + */ + regs->sp = regs->flags; +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* + * arch_rethook_trampoline() skips updating frame pointer. The frame pointer + * saved in arch_rethook_trampoline_callback() points to the real caller + * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have + * a standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); + +/* This is called from rethook_trampoline_handler(). */ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + unsigned long *frame_pointer = ®s->sp + 1; + + /* Replace fake return address with real one. */ + *frame_pointer = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + unsigned long *stack = (unsigned long *)regs->sp; + + rh->ret_addr = stack[0]; + rh->frame = regs->sp; + + /* Replace the return addr with trampoline addr */ + stack[0] = (unsigned long) arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2de3c8c5eba9f..794fdef2501ab 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -550,15 +550,15 @@ bool unwind_next_frame(struct unwind_state *state) } /* * There is a small chance to interrupt at the entry of - * __kretprobe_trampoline() where the ORC info doesn't exist. - * That point is right after the RET to __kretprobe_trampoline() + * arch_rethook_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to arch_rethook_trampoline() * which was modified return address. - * At that point, the @addr_p of the unwind_recover_kretprobe() + * At that point, the @addr_p of the unwind_recover_rethook() * (this has to point the address of the stack entry storing * the modified return address) must be "SP - (a stack entry)" * because SP is incremented by the RET. */ - state->ip = unwind_recover_kretprobe(state, state->ip, + state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); state->regs = (struct pt_regs *)sp; state->prev_regs = NULL; @@ -573,7 +573,7 @@ bool unwind_next_frame(struct unwind_state *state) goto err; } /* See UNWIND_HINT_TYPE_REGS case comment. */ - state->ip = unwind_recover_kretprobe(state, state->ip, + state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); if (state->full_regs) From 0ef6f5c09371f17e142814e6996d6dfb8741925b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 26 Mar 2022 11:27:28 +0900 Subject: [PATCH 10/16] x86,rethook: Fix arch_rethook_trampoline() to generate a complete pt_regs Currently arch_rethook_trampoline() generates an almost complete pt_regs on-stack, everything except regs->ss that is, that currently points to the fake return address, which is not a valid segment descriptor. Since interpretation of regs->[sb]p should be done in the context of regs->ss, and we have code actually doing that (see arch/x86/lib/insn-eval.c for instance), complete the job by also pushing ss. This ensures that anybody who does do look at regs->ss doesn't mysteriously malfunction, avoiding much future pain. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/164826164851.2455864.17272661073069737350.stgit@devnote2 --- arch/x86/kernel/rethook.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c index af7e6713a07a7..8a1c0111ae792 100644 --- a/arch/x86/kernel/rethook.c +++ b/arch/x86/kernel/rethook.c @@ -29,29 +29,31 @@ asm( /* Push a fake return address to tell the unwinder it's a rethook. */ " pushq $arch_rethook_trampoline\n" UNWIND_HINT_FUNC - /* Save the 'sp - 8', this will be fixed later. */ + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 16', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call arch_rethook_trampoline_callback\n" RESTORE_REGS_STRING - /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ - " addq $8, %rsp\n" + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addq $16, %rsp\n" " popfq\n" #else /* Push a fake return address to tell the unwinder it's a rethook. */ " pushl $arch_rethook_trampoline\n" UNWIND_HINT_FUNC - /* Save the 'sp - 4', this will be fixed later. */ + " pushl %ss\n" + /* Save the 'sp - 8', this will be fixed later. */ " pushl %esp\n" " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call arch_rethook_trampoline_callback\n" RESTORE_REGS_STRING - /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ - " addl $4, %esp\n" + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addl $8, %esp\n" " popfl\n" #endif ASM_RET @@ -73,8 +75,8 @@ __used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) #endif regs->ip = (unsigned long)&arch_rethook_trampoline; regs->orig_ax = ~0UL; - regs->sp += sizeof(long); - frame_pointer = ®s->sp + 1; + regs->sp += 2*sizeof(long); + frame_pointer = (long *)(regs + 1); /* * The return address at 'frame_pointer' is recovered by the @@ -84,10 +86,10 @@ __used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) rethook_trampoline_handler(regs, (unsigned long)frame_pointer); /* - * Copy FLAGS to 'pt_regs::sp' so that arch_rethook_trapmoline() + * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trapmoline() * can do RET right after POPF. */ - regs->sp = regs->flags; + *(unsigned long *)®s->ss = regs->flags; } NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); @@ -105,7 +107,7 @@ STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); void arch_rethook_fixup_return(struct pt_regs *regs, unsigned long correct_ret_addr) { - unsigned long *frame_pointer = ®s->sp + 1; + unsigned long *frame_pointer = (void *)(regs + 1); /* Replace fake return address with real one. */ *frame_pointer = correct_ret_addr; From 45c23bf4d1a416d32e509f83719a7399e35bdaf9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 26 Mar 2022 11:27:40 +0900 Subject: [PATCH 11/16] x86,kprobes: Fix optprobe trampoline to generate complete pt_regs Currently the optprobe trampoline template code ganerate an almost complete pt_regs on-stack, everything except regs->ss. The 'regs->ss' points to the top of stack, which is not a valid segment decriptor. As same as the rethook does, complete the job by also pushing ss. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164826166027.2455864.14759128090648961900.stgit@devnote2 --- arch/x86/kernel/kprobes/opt.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b4a54a52aa595..e6b8c5362b945 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -106,7 +106,8 @@ asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 8', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" ".global optprobe_template_clac\n" @@ -121,14 +122,17 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags to rsp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movq 18*8(%rsp), %rdx\n" - " movq %rdx, 19*8(%rsp)\n" + " movq %rdx, 20*8(%rsp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addq $8, %rsp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addq $16, %rsp\n" + /* And pop flags register from 'regs->ss'. */ " popfq\n" #else /* CONFIG_X86_32 */ + " pushl %ss\n" + /* Save the 'sp - 4', this will be fixed later. */ " pushl %esp\n" " pushfl\n" ".global optprobe_template_clac\n" @@ -142,12 +146,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags into esp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movl 14*4(%esp), %edx\n" - " movl %edx, 15*4(%esp)\n" + " movl %edx, 16*4(%esp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addl $4, %esp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addl $8, %esp\n" + /* And pop flags register from 'regs->ss'. */ " popfl\n" #endif ".global optprobe_template_end\n" @@ -179,6 +184,8 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) kprobes_inc_nmissed_count(&op->kp); } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Adjust stack pointer */ + regs->sp += sizeof(long); /* Save skipped registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 From a95a4d9b39b0324402569ed7395aae59b8fd2b11 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 28 Mar 2022 16:21:20 +0200 Subject: [PATCH 12/16] xsk: Do not write NULL in SW ring at allocation failure For the case when xp_alloc_batch() is used but the batched allocation cannot be used, there is a slow path that uses the non-batched xp_alloc(). When it fails to allocate an entry, it returns NULL. The current code wrote this NULL into the entry of the provided results array (pointer to the driver SW ring usually) and returned. This might not be what the driver expects and to make things simpler, just write successfully allocated xdp_buffs into the SW ring,. The driver might have information in there that is still important after an allocation failure. Note that at this point in time, there are no drivers using xp_alloc_batch() that could trigger this slow path. But one might get added. Fixes: 47e4075df300 ("xsk: Batched buffer allocation for the pool") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-2-maciej.fijalkowski@intel.com --- net/xdp/xsk_buff_pool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index b34fca6ada869..af040ffa14ff3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -591,9 +591,13 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) u32 nb_entries1 = 0, nb_entries2; if (unlikely(pool->dma_need_sync)) { + struct xdp_buff *buff; + /* Slow path */ - *xdp = xp_alloc(pool); - return !!*xdp; + buff = xp_alloc(pool); + if (buff) + *xdp = buff; + return !!buff; } if (unlikely(pool->free_list_cnt)) { From 30d19d57d513821c58de4556e7445982ed22b923 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 28 Mar 2022 16:21:21 +0200 Subject: [PATCH 13/16] ice: xsk: Eliminate unnecessary loop iteration The NIC Tx ring completion routine cleans entries from the ring in batches. However, it processes one more batch than it is supposed to. Note that this does not matter from a functionality point of view since it will not find a set DD bit for the next batch and just exit the loop. But from a performance perspective, it is faster to terminate the loop before and not issue an expensive read over PCIe to get the DD bit. Fixes: 126cdfe1007a ("ice: xsk: Improve AF_XDP ZC Tx and use batching API") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-3-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 88853a6ed9317..51427cb4971a5 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -754,7 +754,7 @@ static u16 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring, int napi_budget) next_dd = next_dd + tx_thresh; if (next_dd >= desc_cnt) next_dd = tx_thresh - 1; - } while (budget--); + } while (--budget); xdp_ring->next_dd = next_dd; From 0ec1713009c5cc24244c918def1cd14080be27e3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 28 Mar 2022 16:21:22 +0200 Subject: [PATCH 14/16] ice: xsk: Stop Rx processing when ntc catches ntu This can happen with big budget values and some breakage of re-filling descriptors as we do not clear the entry that ntu is pointing at the end of ice_alloc_rx_bufs_zc. So if ntc is at ntu then it might be the case that status_error0 has an old, uncleared value and ntc would go over with processing which would result in false results. Break Rx loop when ntc == ntu to avoid broken behavior. Fixes: 3876ff525de7 ("ice: xsk: Handle SW XDP ring wrap and bump tail more often") Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-4-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 51427cb4971a5..dfbcaf08520ee 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -608,6 +608,9 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) */ dma_rmb(); + if (unlikely(rx_ring->next_to_clean == rx_ring->next_to_use)) + break; + xdp = *ice_xdp_buf(rx_ring, rx_ring->next_to_clean); size = le16_to_cpu(rx_desc->wb.pkt_len) & From 1ac2524de7b366633fc336db6c94062768d0ab03 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 28 Mar 2022 16:21:23 +0200 Subject: [PATCH 15/16] ice: xsk: Fix indexing in ice_tx_xsk_pool() Ice driver tries to always create XDP rings array to be num_possible_cpus() sized, regardless of user's queue count setting that can be changed via ethtool -L for example. Currently, ice_tx_xsk_pool() calculates the qid by decrementing the ring->q_index by the count of XDP queues, but ring->q_index is set to 'i + vsi->alloc_txq'. When user did ethtool -L $IFACE combined 1, alloc_txq is 1, but vsi->num_xdp_txq is still num_possible_cpus(). Then, ice_tx_xsk_pool() will do OOB access and in the final result ring would not get xsk_pool pointer assigned. Then, each ice_xsk_wakeup() call will fail with error and it will not be possible to get into NAPI and do the processing from driver side. Fix this by decrementing vsi->alloc_txq instead of vsi->num_xdp_txq from ring-q_index in ice_tx_xsk_pool() so the calculation is reflected to the setting of ring->q_index. Fixes: 22bf877e528f ("ice: introduce XDP_TX fallback path") Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-5-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/ice/ice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index b0b27bfcd7a28..d4f1874df7d0b 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -710,7 +710,7 @@ static inline struct xsk_buff_pool *ice_tx_xsk_pool(struct ice_tx_ring *ring) struct ice_vsi *vsi = ring->vsi; u16 qid; - qid = ring->q_index - vsi->num_xdp_txq; + qid = ring->q_index - vsi->alloc_txq; if (!ice_is_xdp_ena_vsi(vsi) || !test_bit(qid, vsi->af_xdp_zc_qps)) return NULL; From ccaff3d56acc47c257a99b2807b7c78a9467cf09 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 25 Mar 2022 13:03:04 -0700 Subject: [PATCH 16/16] selftests/bpf: Fix clang compilation errors llvm upstream patch ([1]) added to issue warning for code like void test() { int j = 0; for (int i = 0; i < 1000; i++) j++; return; } This triggered several errors in selftests/bpf build since compilation flag -Werror is used. ... test_lpm_map.c:212:15: error: variable 'n_matches' set but not used [-Werror,-Wunused-but-set-variable] size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; ^ test_lpm_map.c:212:26: error: variable 'n_matches_after_delete' set but not used [-Werror,-Wunused-but-set-variable] size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; ^ ... prog_tests/get_stack_raw_tp.c:32:15: error: variable 'cnt' set but not used [-Werror,-Wunused-but-set-variable] static __u64 cnt; ^ ... For test_lpm_map.c, 'n_matches'/'n_matches_after_delete' are changed to be volatile in order to silent the warning. I didn't remove these two declarations since they are referenced in a commented code which might be used by people in certain cases. For get_stack_raw_tp.c, the variable 'cnt' is removed. [1] https://reviews.llvm.org/D122271 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220325200304.2915588-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c | 3 --- tools/testing/selftests/bpf/test_lpm_map.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index e834a01de16ac..16048978a1eff 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -29,11 +29,8 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) */ struct get_stack_trace_t e; int i, num_stack; - static __u64 cnt; struct ksym *ks; - cnt++; - memset(&e, 0, sizeof(e)); memcpy(&e, data, size <= sizeof(e) ? size : sizeof(e)); diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index baa3e3ecae824..aa294612e0a7d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -209,7 +209,8 @@ static void test_lpm_order(void) static void test_lpm_map(int keysize) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; + volatile size_t n_matches, n_matches_after_delete; + size_t i, j, n_nodes, n_lookups; struct tlpm_node *t, *list = NULL; struct bpf_lpm_trie_key *key; uint8_t *data, *value;