From 8c1b21198551d795a44f08ad185f716732b47bbe Mon Sep 17 00:00:00 2001 From: Milan Landaverde Date: Tue, 22 Mar 2022 10:49:45 -0400 Subject: [PATCH 01/57] bpf/bpftool: Add unprivileged_bpf_disabled check against value of 2 In [1], we added a kconfig knob that can set /proc/sys/kernel/unprivileged_bpf_disabled to 2 We now check against this value in bpftool feature probe [1] https://lore.kernel.org/bpf/74ec548079189e4e4dffaeb42b8987bb3c852eee.1620765074.git.daniel@iogearbox.net Signed-off-by: Milan Landaverde Signed-off-by: Alexei Starovoitov Acked-by: Quentin Monnet Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20220322145012.1315376-1-milan@mdaverde.com --- tools/bpf/bpftool/feature.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index c2f43a5d38e01..290998c82de12 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -207,7 +207,10 @@ static void probe_unprivileged_disabled(void) printf("bpf() syscall for unprivileged users is enabled\n"); break; case 1: - printf("bpf() syscall restricted to privileged users\n"); + printf("bpf() syscall restricted to privileged users (without recovery)\n"); + break; + case 2: + printf("bpf() syscall restricted to privileged users (admin can change)\n"); break; case -1: printf("Unable to retrieve required privileges for bpf() syscall\n"); From 9052e4e83762c40d89f6650fec161ad9001e93f3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 23 Mar 2022 16:35:26 +0900 Subject: [PATCH 02/57] fprobe: Fix smatch type mismatch warning Fix the type mismatching warning of 'rethook_node vs fprobe_rethook_node' found by Smatch. Reported-by: Dan Carpenter Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164802092611.1732982.12268174743437084619.stgit@devnote2 --- kernel/trace/fprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 8b2dd5b9dcd14..63b2321b22a08 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -150,15 +150,15 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); for (i = 0; i < size; i++) { - struct rethook_node *node; + struct fprobe_rethook_node *node; - node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL); + node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) { rethook_free(fp->rethook); fp->rethook = NULL; return -ENOMEM; } - rethook_add_node(fp->rethook, node); + rethook_add_node(fp->rethook, &node->node); } return 0; } From 261608f3105ce65e9fd01919f72ead74dcb9f14d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 23 Mar 2022 16:35:36 +0900 Subject: [PATCH 03/57] fprobe: Fix sparse warning for acccessing __rcu ftrace_hash Since ftrace_ops::local_hash::filter_hash field is an __rcu pointer, we have to use rcu_access_pointer() to access it. Reported-by: kernel test robot Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164802093635.1732982.4938094876018890866.stgit@devnote2 --- kernel/trace/fprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 63b2321b22a08..89d9f994ebb02 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -215,7 +215,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter * correctly calculate the total number of filtered symbols * from both filter and notfilter. */ - hash = fp->ops.local_hash.filter_hash; + hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); if (WARN_ON_ONCE(!hash)) goto out; From 98870605b3742c38353b50b6dbad33fab9de415f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 24 Mar 2022 16:37:32 +0800 Subject: [PATCH 04/57] bpf: Sync comments for bpf_get_stack Commit ee2a098851bf missed updating the comments for helper bpf_get_stack in tools/include/uapi/linux/bpf.h. Sync it. Fixes: ee2a098851bf ("bpf: Adjust BPF stack helper functions to accommodate skip > 0") Signed-off-by: Geliang Tang Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/ce54617746b7ed5e9ba3b844e55e74cb8a60e0b5.1648110794.git.geliang.tang@suse.com --- tools/include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7604e7d5438f8..d14b10b85e51f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3009,8 +3009,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -4316,8 +4316,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description From c29a4920dfcaa1433b09e2674f605f72767a385c Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 25 Mar 2022 00:42:38 +0800 Subject: [PATCH 05/57] bpf: Fix maximum permitted number of arguments check Since the m->arg_size array can hold up to MAX_BPF_FUNC_ARGS argument sizes, it's ok that nargs is equal to MAX_BPF_FUNC_ARGS. Signed-off-by: Yuntao Wang Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220324164238.1274915-1-ytcoode@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 24788ce564a08..0918a39279f6c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5507,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); - if (nargs >= MAX_BPF_FUNC_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. Too many.\n", tname, nargs); From 99dea2c664d7bc7e4f6f6947182d0d365165a998 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 25 Mar 2022 15:56:43 -0700 Subject: [PATCH 06/57] selftests/bpf: fix selftest after random: Urandom_read tracepoint removal 14c174633f34 ("random: remove unused tracepoints") removed all the tracepoints from drivers/char/random.c, one of which, random:urandom_read, was used by stacktrace_build_id selftest to trigger stack trace capture. Fix breakage by switching to kprobing urandom_read() function. Suggested-by: Yonghong Song Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220325225643.2606-1-andrii@kernel.org --- .../selftests/bpf/progs/test_stacktrace_build_id.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c index 36a707e7c7a7a..6c62bfb8bb6fa 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c @@ -39,16 +39,8 @@ struct { __type(value, stack_trace_t); } stack_amap SEC(".maps"); -/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ -struct random_urandom_args { - unsigned long long pad; - int got_bits; - int pool_left; - int input_left; -}; - -SEC("tracepoint/random/urandom_read") -int oncpu(struct random_urandom_args *args) +SEC("kprobe/urandom_read") +int oncpu(struct pt_regs *args) { __u32 max_len = sizeof(struct bpf_stack_build_id) * PERF_MAX_STACK_DEPTH; From ef8a257b4e499a979364b1f9caf25a325f6ee8b8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 28 Mar 2022 10:37:03 +0200 Subject: [PATCH 07/57] bpftool: Fix generated code in codegen_asserts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arnaldo reported perf compilation fail with: $ make -k BUILD_BPF_SKEL=1 CORESIGHT=1 PYTHON=python3 ... In file included from util/bpf_counter.c:28: /tmp/build/perf//util/bpf_skel/bperf_leader.skel.h: In function ‘bperf_leader_bpf__assert’: /tmp/build/perf//util/bpf_skel/bperf_leader.skel.h:351:51: error: unused parameter ‘s’ [-Werror=unused-parameter] 351 | bperf_leader_bpf__assert(struct bperf_leader_bpf *s) | ~~~~~~~~~~~~~~~~~~~~~~~~~^ cc1: all warnings being treated as errors If there's nothing to generate in the new assert function, we will get unused 's' warn/error, adding 'unused' attribute to it. Fixes: 08d4dba6ae77 ("bpftool: Bpf skeletons assert type sizes") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/bpf/20220328083703.2880079-1-jolsa@kernel.org --- tools/bpf/bpftool/gen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7ba7ff55d2eaa..91af2850b5057 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -477,7 +477,7 @@ static void codegen_asserts(struct bpf_object *obj, const char *obj_name) codegen("\ \n\ __attribute__((unused)) static void \n\ - %1$s__assert(struct %1$s *s) \n\ + %1$s__assert(struct %1$s *s __attribute__((unused))) \n\ { \n\ #ifdef __cplusplus \n\ #define _Static_assert static_assert \n\ From 73f9b911faa74ac5107879de05c9489c419f41bb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 26 Mar 2022 11:27:05 +0900 Subject: [PATCH 08/57] kprobes: Use rethook for kretprobe if possible Use rethook for kretprobe function return hooking if the arch sets CONFIG_HAVE_RETHOOK=y. In this case, CONFIG_KRETPROBE_ON_RETHOOK is set to 'y' automatically, and the kretprobe internal data fields switches to use rethook. If not, it continues to use kretprobe specific function return hooks. Suggested-by: Peter Zijlstra Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164826162556.2455864.12255833167233452047.stgit@devnote2 --- arch/Kconfig | 8 ++- include/linux/kprobes.h | 51 ++++++++++++++- kernel/Makefile | 1 + kernel/kprobes.c | 124 ++++++++++++++++++++++++++++++------ kernel/trace/trace_kprobe.c | 4 +- 5 files changed, 163 insertions(+), 25 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 84bc1de02720b..33e06966f2487 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -164,7 +164,13 @@ config ARCH_USE_BUILTIN_BSWAP config KRETPROBES def_bool y - depends on KPROBES && HAVE_KRETPROBES + depends on KPROBES && (HAVE_KRETPROBES || HAVE_RETHOOK) + +config KRETPROBE_ON_RETHOOK + def_bool y + depends on HAVE_RETHOOK + depends on KRETPROBES + select RETHOOK config USER_RETURN_NOTIFIER bool diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 312ff997c7436..157168769fc26 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -149,13 +150,20 @@ struct kretprobe { int maxactive; int nmissed; size_t data_size; +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct rethook *rh; +#else struct freelist_head freelist; struct kretprobe_holder *rph; +#endif }; #define KRETPROBE_MAX_DATA_SIZE 4096 struct kretprobe_instance { +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + struct rethook_node node; +#else union { struct freelist_node freelist; struct rcu_head rcu; @@ -164,6 +172,7 @@ struct kretprobe_instance { struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; void *fp; +#endif char data[]; }; @@ -186,10 +195,24 @@ extern void kprobe_busy_begin(void); extern void kprobe_busy_end(void); #ifdef CONFIG_KRETPROBES -extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs); +/* Check whether @p is used for implementing a trampoline. */ extern int arch_trampoline_kprobe(struct kprobe *p); +#ifdef CONFIG_KRETPROBE_ON_RETHOOK +static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), + "Kretprobe is accessed from instance under preemptive context"); + + return (struct kretprobe *)READ_ONCE(ri->node.rethook->data); +} +static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) +{ + return ri->node.ret_addr; +} +#else +extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs); void arch_kretprobe_fixup_return(struct pt_regs *regs, kprobe_opcode_t *correct_ret_addr); @@ -232,6 +255,12 @@ static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance return READ_ONCE(ri->rph->rp); } +static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) +{ + return (unsigned long)ri->ret_addr; +} +#endif /* CONFIG_KRETPROBE_ON_RETHOOK */ + #else /* !CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) @@ -395,7 +424,11 @@ void unregister_kretprobe(struct kretprobe *rp); int register_kretprobes(struct kretprobe **rps, int num); void unregister_kretprobes(struct kretprobe **rps, int num); +#ifdef CONFIG_KRETPROBE_ON_RETHOOK +#define kprobe_flush_task(tk) do {} while (0) +#else void kprobe_flush_task(struct task_struct *tk); +#endif void kprobe_free_init_mem(void); @@ -509,6 +542,19 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) #endif /* !CONFIG_OPTPROBES */ #ifdef CONFIG_KRETPROBES +#ifdef CONFIG_KRETPROBE_ON_RETHOOK +static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) +{ + return is_rethook_trampoline(addr); +} + +static nokprobe_inline +unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, + struct llist_node **cur) +{ + return rethook_find_ret_addr(tsk, (unsigned long)fp, cur); +} +#else static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) { return (void *)addr == kretprobe_trampoline_addr(); @@ -516,6 +562,7 @@ static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, struct llist_node **cur); +#endif #else static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) { diff --git a/kernel/Makefile b/kernel/Makefile index 56f4ee97f3284..471d71935e90a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,6 +108,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ +obj-$(CONFIG_RETHOOK) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 185badc780b7c..dbe57df2e199e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1237,6 +1237,27 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); +static struct kprobe kprobe_busy = { + .addr = (void *) get_kprobe, +}; + +void kprobe_busy_begin(void) +{ + struct kprobe_ctlblk *kcb; + + preempt_disable(); + __this_cpu_write(current_kprobe, &kprobe_busy); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; +} + +void kprobe_busy_end(void) +{ + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); +} + +#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) static void free_rp_inst_rcu(struct rcu_head *head) { struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); @@ -1258,26 +1279,6 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) } NOKPROBE_SYMBOL(recycle_rp_inst); -static struct kprobe kprobe_busy = { - .addr = (void *) get_kprobe, -}; - -void kprobe_busy_begin(void) -{ - struct kprobe_ctlblk *kcb; - - preempt_disable(); - __this_cpu_write(current_kprobe, &kprobe_busy); - kcb = get_kprobe_ctlblk(); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; -} - -void kprobe_busy_end(void) -{ - __this_cpu_write(current_kprobe, NULL); - preempt_enable(); -} - /* * This function is called from delayed_put_task_struct() when a task is * dead and cleaned up to recycle any kretprobe instances associated with @@ -1327,6 +1328,7 @@ static inline void free_rp_inst(struct kretprobe *rp) rp->rph = NULL; } } +#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ /* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) @@ -1925,6 +1927,7 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES +#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) /* This assumes the 'tsk' is the current task or the is not running. */ static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, struct llist_node **cur) @@ -2087,6 +2090,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); +#else /* CONFIG_KRETPROBE_ON_RETHOOK */ +/* + * This kprobe pre_handler is registered with every kretprobe. When probe + * hits it will set up the return probe. + */ +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + struct kretprobe_instance *ri; + struct rethook_node *rhn; + + rhn = rethook_try_get(rp->rh); + if (!rhn) { + rp->nmissed++; + return 0; + } + + ri = container_of(rhn, struct kretprobe_instance, node); + + if (rp->entry_handler && rp->entry_handler(ri, regs)) + rethook_recycle(rhn); + else + rethook_hook(rhn, regs, kprobe_ftrace(p)); + + return 0; +} +NOKPROBE_SYMBOL(pre_handler_kretprobe); + +static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) +{ + struct kretprobe *rp = (struct kretprobe *)data; + struct kretprobe_instance *ri; + struct kprobe_ctlblk *kcb; + + /* The data must NOT be null. This means rethook data structure is broken. */ + if (WARN_ON_ONCE(!data)) + return; + + __this_cpu_write(current_kprobe, &rp->kp); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + ri = container_of(rh, struct kretprobe_instance, node); + rp->handler(ri, regs); + + __this_cpu_write(current_kprobe, NULL); +} +NOKPROBE_SYMBOL(kretprobe_rethook_handler); + +#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ /** * kprobe_on_func_entry() -- check whether given address is function entry @@ -2155,6 +2209,29 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); + if (!rp->rh) + return -ENOMEM; + + for (i = 0; i < rp->maxactive; i++) { + inst = kzalloc(sizeof(struct kretprobe_instance) + + rp->data_size, GFP_KERNEL); + if (inst == NULL) { + rethook_free(rp->rh); + rp->rh = NULL; + return -ENOMEM; + } + rethook_add_node(rp->rh, &inst->node); + } + rp->nmissed = 0; + /* Establish function entry probe point */ + ret = register_kprobe(&rp->kp); + if (ret != 0) { + rethook_free(rp->rh); + rp->rh = NULL; + } +#else /* !CONFIG_KRETPROBE_ON_RETHOOK */ rp->freelist.head = NULL; rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); if (!rp->rph) @@ -2179,6 +2256,7 @@ int register_kretprobe(struct kretprobe *rp) ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); +#endif return ret; } EXPORT_SYMBOL_GPL(register_kretprobe); @@ -2217,7 +2295,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num) for (i = 0; i < num; i++) { if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + rethook_free(rps[i]->rh); +#else rps[i]->rph->rp = NULL; +#endif } mutex_unlock(&kprobe_mutex); @@ -2225,7 +2307,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num) for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); +#ifndef CONFIG_KRETPROBE_ON_RETHOOK free_rp_inst(rps[i]); +#endif } } } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b62fd785b599b..47cebef78532c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1433,7 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, fbuffer.regs = regs; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; + entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); @@ -1628,7 +1628,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; entry->func = (unsigned long)tk->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; + entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); From f3a112c0c40dd96d53c8bdf3ea8d94d528f3b7b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 26 Mar 2022 11:27:17 +0900 Subject: [PATCH 09/57] x86,rethook,kprobes: Replace kretprobe with rethook on x86 Replaces the kretprobe code with rethook on x86. With this patch, kretprobe on x86 uses the rethook instead of kretprobe specific trampoline code. Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Tested-by: Jiri Olsa Link: https://lore.kernel.org/bpf/164826163692.2455864.13745421016848209527.stgit@devnote2 --- arch/x86/Kconfig | 1 + arch/x86/include/asm/unwind.h | 23 +++--- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kprobes/common.h | 1 + arch/x86/kernel/kprobes/core.c | 107 -------------------------- arch/x86/kernel/rethook.c | 125 +++++++++++++++++++++++++++++++ arch/x86/kernel/unwind_orc.c | 10 +-- 7 files changed, 144 insertions(+), 124 deletions(-) create mode 100644 arch/x86/kernel/rethook.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7340d9f01b621..ff45a27fc29ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -224,6 +224,7 @@ config X86 select HAVE_KPROBES_ON_FTRACE select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES + select HAVE_RETHOOK select HAVE_KVM select HAVE_LIVEPATCH if X86_64 select HAVE_MIXED_BREAKPOINTS_REGS diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 2a1f8734416dc..7cede4dc21f00 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include @@ -16,7 +16,7 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; -#ifdef CONFIG_KRETPROBES +#if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; @@ -104,19 +104,18 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, #endif static inline -unsigned long unwind_recover_kretprobe(struct unwind_state *state, - unsigned long addr, unsigned long *addr_p) +unsigned long unwind_recover_rethook(struct unwind_state *state, + unsigned long addr, unsigned long *addr_p) { -#ifdef CONFIG_KRETPROBES - return is_kretprobe_trampoline(addr) ? - kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) : - addr; -#else - return addr; +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(addr)) + return rethook_find_ret_addr(state->task, (unsigned long)addr_p, + &state->kr_cur); #endif + return addr; } -/* Recover the return address modified by kretprobe and ftrace_graph. */ +/* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) @@ -125,7 +124,7 @@ unsigned long unwind_recover_ret_addr(struct unwind_state *state, ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); - return unwind_recover_kretprobe(state, ret, addr_p); + return unwind_recover_rethook(state, ret, addr_p); } /* diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6462e3dd98f49..c41ef42adbe8a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 7d3a2e2daf011..c993521d49332 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -6,6 +6,7 @@ #include #include +#include #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 8ef933c03afac..7c4ab8870da44 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -811,18 +811,6 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, = (regs->flags & X86_EFLAGS_IF); } -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long *sara = stack_addr(regs); - - ri->ret_addr = (kprobe_opcode_t *) *sara; - ri->fp = sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { @@ -1023,101 +1011,6 @@ int kprobe_int3_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_int3_handler); -/* - * When a retprobed function returns, this code saves registers and - * calls trampoline_handler() runs, which calls the kretprobe's handler. - */ -asm( - ".text\n" - ".global __kretprobe_trampoline\n" - ".type __kretprobe_trampoline, @function\n" - "__kretprobe_trampoline:\n" -#ifdef CONFIG_X86_64 - ANNOTATE_NOENDBR - /* Push a fake return address to tell the unwinder it's a kretprobe. */ - " pushq $__kretprobe_trampoline\n" - UNWIND_HINT_FUNC - /* Save the 'sp - 8', this will be fixed later. */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - RESTORE_REGS_STRING - /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ - " addq $8, %rsp\n" - " popfq\n" -#else - /* Push a fake return address to tell the unwinder it's a kretprobe. */ - " pushl $__kretprobe_trampoline\n" - UNWIND_HINT_FUNC - /* Save the 'sp - 4', this will be fixed later. */ - " pushl %esp\n" - " pushfl\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - RESTORE_REGS_STRING - /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ - " addl $4, %esp\n" - " popfl\n" -#endif - ASM_RET - ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" -); -NOKPROBE_SYMBOL(__kretprobe_trampoline); -/* - * __kretprobe_trampoline() skips updating frame pointer. The frame pointer - * saved in trampoline_handler() points to the real caller function's - * frame pointer. Thus the __kretprobe_trampoline() doesn't have a - * standard stack frame with CONFIG_FRAME_POINTER=y. - * Let's mark it non-standard function. Anyway, FP unwinder can correctly - * unwind without the hint. - */ -STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); - -/* This is called from kretprobe_trampoline_handler(). */ -void arch_kretprobe_fixup_return(struct pt_regs *regs, - kprobe_opcode_t *correct_ret_addr) -{ - unsigned long *frame_pointer = ®s->sp + 1; - - /* Replace fake return address with real one. */ - *frame_pointer = (unsigned long)correct_ret_addr; -} - -/* - * Called from __kretprobe_trampoline - */ -__used __visible void trampoline_handler(struct pt_regs *regs) -{ - unsigned long *frame_pointer; - - /* fixup registers */ - regs->cs = __KERNEL_CS; -#ifdef CONFIG_X86_32 - regs->gs = 0; -#endif - regs->ip = (unsigned long)&__kretprobe_trampoline; - regs->orig_ax = ~0UL; - regs->sp += sizeof(long); - frame_pointer = ®s->sp + 1; - - /* - * The return address at 'frame_pointer' is recovered by the - * arch_kretprobe_fixup_return() which called from the - * kretprobe_trampoline_handler(). - */ - kretprobe_trampoline_handler(regs, frame_pointer); - - /* - * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline() - * can do RET right after POPF. - */ - regs->sp = regs->flags; -} -NOKPROBE_SYMBOL(trampoline_handler); - int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c new file mode 100644 index 0000000000000..af7e6713a07a7 --- /dev/null +++ b/arch/x86/kernel/rethook.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c. + */ +#include +#include +#include +#include + +#include "kprobes/common.h" + +__visible void arch_rethook_trampoline_callback(struct pt_regs *regs); + +#ifndef ANNOTATE_NOENDBR +#define ANNOTATE_NOENDBR +#endif + +/* + * When a target function returns, this code saves registers and calls + * arch_rethook_trampoline_callback(), which calls the rethook handler. + */ +asm( + ".text\n" + ".global arch_rethook_trampoline\n" + ".type arch_rethook_trampoline, @function\n" + "arch_rethook_trampoline:\n" +#ifdef CONFIG_X86_64 + ANNOTATE_NOENDBR /* This is only jumped from ret instruction */ + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushq $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + /* Save the 'sp - 8', this will be fixed later. */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ + " addq $8, %rsp\n" + " popfq\n" +#else + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushl $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + /* Save the 'sp - 4', this will be fixed later. */ + " pushl %esp\n" + " pushfl\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ + " addl $4, %esp\n" + " popfl\n" +#endif + ASM_RET + ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n" +); +NOKPROBE_SYMBOL(arch_rethook_trampoline); + +/* + * Called from arch_rethook_trampoline + */ +__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + unsigned long *frame_pointer; + + /* fixup registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)&arch_rethook_trampoline; + regs->orig_ax = ~0UL; + regs->sp += sizeof(long); + frame_pointer = ®s->sp + 1; + + /* + * The return address at 'frame_pointer' is recovered by the + * arch_rethook_fixup_return() which called from this + * rethook_trampoline_handler(). + */ + rethook_trampoline_handler(regs, (unsigned long)frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::sp' so that arch_rethook_trapmoline() + * can do RET right after POPF. + */ + regs->sp = regs->flags; +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* + * arch_rethook_trampoline() skips updating frame pointer. The frame pointer + * saved in arch_rethook_trampoline_callback() points to the real caller + * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have + * a standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); + +/* This is called from rethook_trampoline_handler(). */ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + unsigned long *frame_pointer = ®s->sp + 1; + + /* Replace fake return address with real one. */ + *frame_pointer = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + unsigned long *stack = (unsigned long *)regs->sp; + + rh->ret_addr = stack[0]; + rh->frame = regs->sp; + + /* Replace the return addr with trampoline addr */ + stack[0] = (unsigned long) arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2de3c8c5eba9f..794fdef2501ab 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -550,15 +550,15 @@ bool unwind_next_frame(struct unwind_state *state) } /* * There is a small chance to interrupt at the entry of - * __kretprobe_trampoline() where the ORC info doesn't exist. - * That point is right after the RET to __kretprobe_trampoline() + * arch_rethook_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to arch_rethook_trampoline() * which was modified return address. - * At that point, the @addr_p of the unwind_recover_kretprobe() + * At that point, the @addr_p of the unwind_recover_rethook() * (this has to point the address of the stack entry storing * the modified return address) must be "SP - (a stack entry)" * because SP is incremented by the RET. */ - state->ip = unwind_recover_kretprobe(state, state->ip, + state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); state->regs = (struct pt_regs *)sp; state->prev_regs = NULL; @@ -573,7 +573,7 @@ bool unwind_next_frame(struct unwind_state *state) goto err; } /* See UNWIND_HINT_TYPE_REGS case comment. */ - state->ip = unwind_recover_kretprobe(state, state->ip, + state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); if (state->full_regs) From 0ef6f5c09371f17e142814e6996d6dfb8741925b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 26 Mar 2022 11:27:28 +0900 Subject: [PATCH 10/57] x86,rethook: Fix arch_rethook_trampoline() to generate a complete pt_regs Currently arch_rethook_trampoline() generates an almost complete pt_regs on-stack, everything except regs->ss that is, that currently points to the fake return address, which is not a valid segment descriptor. Since interpretation of regs->[sb]p should be done in the context of regs->ss, and we have code actually doing that (see arch/x86/lib/insn-eval.c for instance), complete the job by also pushing ss. This ensures that anybody who does do look at regs->ss doesn't mysteriously malfunction, avoiding much future pain. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/164826164851.2455864.17272661073069737350.stgit@devnote2 --- arch/x86/kernel/rethook.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c index af7e6713a07a7..8a1c0111ae792 100644 --- a/arch/x86/kernel/rethook.c +++ b/arch/x86/kernel/rethook.c @@ -29,29 +29,31 @@ asm( /* Push a fake return address to tell the unwinder it's a rethook. */ " pushq $arch_rethook_trampoline\n" UNWIND_HINT_FUNC - /* Save the 'sp - 8', this will be fixed later. */ + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 16', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call arch_rethook_trampoline_callback\n" RESTORE_REGS_STRING - /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ - " addq $8, %rsp\n" + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addq $16, %rsp\n" " popfq\n" #else /* Push a fake return address to tell the unwinder it's a rethook. */ " pushl $arch_rethook_trampoline\n" UNWIND_HINT_FUNC - /* Save the 'sp - 4', this will be fixed later. */ + " pushl %ss\n" + /* Save the 'sp - 8', this will be fixed later. */ " pushl %esp\n" " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call arch_rethook_trampoline_callback\n" RESTORE_REGS_STRING - /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */ - " addl $4, %esp\n" + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addl $8, %esp\n" " popfl\n" #endif ASM_RET @@ -73,8 +75,8 @@ __used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) #endif regs->ip = (unsigned long)&arch_rethook_trampoline; regs->orig_ax = ~0UL; - regs->sp += sizeof(long); - frame_pointer = ®s->sp + 1; + regs->sp += 2*sizeof(long); + frame_pointer = (long *)(regs + 1); /* * The return address at 'frame_pointer' is recovered by the @@ -84,10 +86,10 @@ __used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) rethook_trampoline_handler(regs, (unsigned long)frame_pointer); /* - * Copy FLAGS to 'pt_regs::sp' so that arch_rethook_trapmoline() + * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trapmoline() * can do RET right after POPF. */ - regs->sp = regs->flags; + *(unsigned long *)®s->ss = regs->flags; } NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); @@ -105,7 +107,7 @@ STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); void arch_rethook_fixup_return(struct pt_regs *regs, unsigned long correct_ret_addr) { - unsigned long *frame_pointer = ®s->sp + 1; + unsigned long *frame_pointer = (void *)(regs + 1); /* Replace fake return address with real one. */ *frame_pointer = correct_ret_addr; From 45c23bf4d1a416d32e509f83719a7399e35bdaf9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 26 Mar 2022 11:27:40 +0900 Subject: [PATCH 11/57] x86,kprobes: Fix optprobe trampoline to generate complete pt_regs Currently the optprobe trampoline template code ganerate an almost complete pt_regs on-stack, everything except regs->ss. The 'regs->ss' points to the top of stack, which is not a valid segment decriptor. As same as the rethook does, complete the job by also pushing ss. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164826166027.2455864.14759128090648961900.stgit@devnote2 --- arch/x86/kernel/kprobes/opt.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b4a54a52aa595..e6b8c5362b945 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -106,7 +106,8 @@ asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 8', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" ".global optprobe_template_clac\n" @@ -121,14 +122,17 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags to rsp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movq 18*8(%rsp), %rdx\n" - " movq %rdx, 19*8(%rsp)\n" + " movq %rdx, 20*8(%rsp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addq $8, %rsp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addq $16, %rsp\n" + /* And pop flags register from 'regs->ss'. */ " popfq\n" #else /* CONFIG_X86_32 */ + " pushl %ss\n" + /* Save the 'sp - 4', this will be fixed later. */ " pushl %esp\n" " pushfl\n" ".global optprobe_template_clac\n" @@ -142,12 +146,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags into esp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movl 14*4(%esp), %edx\n" - " movl %edx, 15*4(%esp)\n" + " movl %edx, 16*4(%esp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addl $4, %esp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addl $8, %esp\n" + /* And pop flags register from 'regs->ss'. */ " popfl\n" #endif ".global optprobe_template_end\n" @@ -179,6 +184,8 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) kprobes_inc_nmissed_count(&op->kp); } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Adjust stack pointer */ + regs->sp += sizeof(long); /* Save skipped registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 From a95a4d9b39b0324402569ed7395aae59b8fd2b11 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 28 Mar 2022 16:21:20 +0200 Subject: [PATCH 12/57] xsk: Do not write NULL in SW ring at allocation failure For the case when xp_alloc_batch() is used but the batched allocation cannot be used, there is a slow path that uses the non-batched xp_alloc(). When it fails to allocate an entry, it returns NULL. The current code wrote this NULL into the entry of the provided results array (pointer to the driver SW ring usually) and returned. This might not be what the driver expects and to make things simpler, just write successfully allocated xdp_buffs into the SW ring,. The driver might have information in there that is still important after an allocation failure. Note that at this point in time, there are no drivers using xp_alloc_batch() that could trigger this slow path. But one might get added. Fixes: 47e4075df300 ("xsk: Batched buffer allocation for the pool") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-2-maciej.fijalkowski@intel.com --- net/xdp/xsk_buff_pool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index b34fca6ada869..af040ffa14ff3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -591,9 +591,13 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) u32 nb_entries1 = 0, nb_entries2; if (unlikely(pool->dma_need_sync)) { + struct xdp_buff *buff; + /* Slow path */ - *xdp = xp_alloc(pool); - return !!*xdp; + buff = xp_alloc(pool); + if (buff) + *xdp = buff; + return !!buff; } if (unlikely(pool->free_list_cnt)) { From 30d19d57d513821c58de4556e7445982ed22b923 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 28 Mar 2022 16:21:21 +0200 Subject: [PATCH 13/57] ice: xsk: Eliminate unnecessary loop iteration The NIC Tx ring completion routine cleans entries from the ring in batches. However, it processes one more batch than it is supposed to. Note that this does not matter from a functionality point of view since it will not find a set DD bit for the next batch and just exit the loop. But from a performance perspective, it is faster to terminate the loop before and not issue an expensive read over PCIe to get the DD bit. Fixes: 126cdfe1007a ("ice: xsk: Improve AF_XDP ZC Tx and use batching API") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-3-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 88853a6ed9317..51427cb4971a5 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -754,7 +754,7 @@ static u16 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring, int napi_budget) next_dd = next_dd + tx_thresh; if (next_dd >= desc_cnt) next_dd = tx_thresh - 1; - } while (budget--); + } while (--budget); xdp_ring->next_dd = next_dd; From 0ec1713009c5cc24244c918def1cd14080be27e3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 28 Mar 2022 16:21:22 +0200 Subject: [PATCH 14/57] ice: xsk: Stop Rx processing when ntc catches ntu This can happen with big budget values and some breakage of re-filling descriptors as we do not clear the entry that ntu is pointing at the end of ice_alloc_rx_bufs_zc. So if ntc is at ntu then it might be the case that status_error0 has an old, uncleared value and ntc would go over with processing which would result in false results. Break Rx loop when ntc == ntu to avoid broken behavior. Fixes: 3876ff525de7 ("ice: xsk: Handle SW XDP ring wrap and bump tail more often") Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-4-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 51427cb4971a5..dfbcaf08520ee 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -608,6 +608,9 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) */ dma_rmb(); + if (unlikely(rx_ring->next_to_clean == rx_ring->next_to_use)) + break; + xdp = *ice_xdp_buf(rx_ring, rx_ring->next_to_clean); size = le16_to_cpu(rx_desc->wb.pkt_len) & From 1ac2524de7b366633fc336db6c94062768d0ab03 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 28 Mar 2022 16:21:23 +0200 Subject: [PATCH 15/57] ice: xsk: Fix indexing in ice_tx_xsk_pool() Ice driver tries to always create XDP rings array to be num_possible_cpus() sized, regardless of user's queue count setting that can be changed via ethtool -L for example. Currently, ice_tx_xsk_pool() calculates the qid by decrementing the ring->q_index by the count of XDP queues, but ring->q_index is set to 'i + vsi->alloc_txq'. When user did ethtool -L $IFACE combined 1, alloc_txq is 1, but vsi->num_xdp_txq is still num_possible_cpus(). Then, ice_tx_xsk_pool() will do OOB access and in the final result ring would not get xsk_pool pointer assigned. Then, each ice_xsk_wakeup() call will fail with error and it will not be possible to get into NAPI and do the processing from driver side. Fix this by decrementing vsi->alloc_txq instead of vsi->num_xdp_txq from ring-q_index in ice_tx_xsk_pool() so the calculation is reflected to the setting of ring->q_index. Fixes: 22bf877e528f ("ice: introduce XDP_TX fallback path") Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220328142123.170157-5-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/ice/ice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index b0b27bfcd7a28..d4f1874df7d0b 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -710,7 +710,7 @@ static inline struct xsk_buff_pool *ice_tx_xsk_pool(struct ice_tx_ring *ring) struct ice_vsi *vsi = ring->vsi; u16 qid; - qid = ring->q_index - vsi->num_xdp_txq; + qid = ring->q_index - vsi->alloc_txq; if (!ice_is_xdp_ena_vsi(vsi) || !test_bit(qid, vsi->af_xdp_zc_qps)) return NULL; From ccaff3d56acc47c257a99b2807b7c78a9467cf09 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 25 Mar 2022 13:03:04 -0700 Subject: [PATCH 16/57] selftests/bpf: Fix clang compilation errors llvm upstream patch ([1]) added to issue warning for code like void test() { int j = 0; for (int i = 0; i < 1000; i++) j++; return; } This triggered several errors in selftests/bpf build since compilation flag -Werror is used. ... test_lpm_map.c:212:15: error: variable 'n_matches' set but not used [-Werror,-Wunused-but-set-variable] size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; ^ test_lpm_map.c:212:26: error: variable 'n_matches_after_delete' set but not used [-Werror,-Wunused-but-set-variable] size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; ^ ... prog_tests/get_stack_raw_tp.c:32:15: error: variable 'cnt' set but not used [-Werror,-Wunused-but-set-variable] static __u64 cnt; ^ ... For test_lpm_map.c, 'n_matches'/'n_matches_after_delete' are changed to be volatile in order to silent the warning. I didn't remove these two declarations since they are referenced in a commented code which might be used by people in certain cases. For get_stack_raw_tp.c, the variable 'cnt' is removed. [1] https://reviews.llvm.org/D122271 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220325200304.2915588-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c | 3 --- tools/testing/selftests/bpf/test_lpm_map.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index e834a01de16ac..16048978a1eff 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -29,11 +29,8 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) */ struct get_stack_trace_t e; int i, num_stack; - static __u64 cnt; struct ksym *ks; - cnt++; - memset(&e, 0, sizeof(e)); memcpy(&e, data, size <= sizeof(e) ? size : sizeof(e)); diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index baa3e3ecae824..aa294612e0a7d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -209,7 +209,8 @@ static void test_lpm_order(void) static void test_lpm_map(int keysize) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; + volatile size_t n_matches, n_matches_after_delete; + size_t i, j, n_nodes, n_lookups; struct tlpm_node *t, *list = NULL; struct bpf_lpm_trie_key *key; uint8_t *data, *value; From f19c44452b58a84d95e209b847f5495d91c9983a Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Mon, 28 Mar 2022 11:11:48 +0530 Subject: [PATCH 17/57] openvswitch: Fixed nd target mask field in the flow dump. IPv6 nd target mask was not getting populated in flow dump. In the function __ovs_nla_put_key the icmp code mask field was checked instead of icmp code key field to classify the flow as neighbour discovery. ufid:bdfbe3e5-60c2-43b0-a5ff-dfcac1c37328, recirc_id(0),dp_hash(0/0), skb_priority(0/0),in_port(ovs-nm1),skb_mark(0/0),ct_state(0/0), ct_zone(0/0),ct_mark(0/0),ct_label(0/0), eth(src=00:00:00:00:00:00/00:00:00:00:00:00, dst=00:00:00:00:00:00/00:00:00:00:00:00), eth_type(0x86dd), ipv6(src=::/::,dst=::/::,label=0/0,proto=58,tclass=0/0,hlimit=0/0,frag=no), icmpv6(type=135,code=0), nd(target=2001::2/::, sll=00:00:00:00:00:00/00:00:00:00:00:00, tll=00:00:00:00:00:00/00:00:00:00:00:00), packets:10, bytes:860, used:0.504s, dp:ovs, actions:ovs-nm2 Fixes: e64457191a25 (openvswitch: Restructure datapath.c and flow.c) Signed-off-by: Martin Varghese Link: https://lore.kernel.org/r/20220328054148.3057-1-martinvarghesenokia@gmail.com Signed-off-by: Paolo Abeni --- net/openvswitch/flow_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5176f6ccac8ef..cc282a58b75b9 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2230,8 +2230,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, icmpv6_key->icmpv6_type = ntohs(output->tp.src); icmpv6_key->icmpv6_code = ntohs(output->tp.dst); - if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + if (swkey->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + swkey->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { struct ovs_key_nd *nd_key; nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); From 5352a761308397a0e6250fdc629bb3f615b94747 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Mon, 28 Mar 2022 21:00:14 +0800 Subject: [PATCH 18/57] ax25: fix UAF bug in ax25_send_control() There are UAF bugs in ax25_send_control(), when we call ax25_release() to deallocate ax25_dev. The possible race condition is shown below: (Thread 1) | (Thread 2) ax25_dev_device_up() //(1) | | ax25_kill_by_device() ax25_bind() //(2) | ax25_connect() | ... ax25->state = AX25_STATE_1 | ... | ax25_dev_device_down() //(3) (Thread 3) ax25_release() | ax25_dev_put() //(4) FREE | case AX25_STATE_1: | ax25_send_control() | alloc_skb() //USE | The refcount of ax25_dev increases in position (1) and (2), and decreases in position (3) and (4). The ax25_dev will be freed before dereference sites in ax25_send_control(). The following is part of the report: [ 102.297448] BUG: KASAN: use-after-free in ax25_send_control+0x33/0x210 [ 102.297448] Read of size 8 at addr ffff888009e6e408 by task ax25_close/602 [ 102.297448] Call Trace: [ 102.303751] ax25_send_control+0x33/0x210 [ 102.303751] ax25_release+0x356/0x450 [ 102.305431] __sock_release+0x6d/0x120 [ 102.305431] sock_close+0xf/0x20 [ 102.305431] __fput+0x11f/0x420 [ 102.305431] task_work_run+0x86/0xd0 [ 102.307130] get_signal+0x1075/0x1220 [ 102.308253] arch_do_signal_or_restart+0x1df/0xc00 [ 102.308253] exit_to_user_mode_prepare+0x150/0x1e0 [ 102.308253] syscall_exit_to_user_mode+0x19/0x50 [ 102.308253] do_syscall_64+0x48/0x90 [ 102.308253] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 102.308253] RIP: 0033:0x405ae7 This patch defers the free operation of ax25_dev and net_device after all corresponding dereference sites in ax25_release() to avoid UAF. Fixes: 9fd75b66b8f6 ("ax25: Fix refcount leaks caused by ax25_cb_del()") Signed-off-by: Duoming Zhou Signed-off-by: Paolo Abeni --- net/ax25/af_ax25.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 992b6e5d85d71..f5686c463bc0d 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -991,10 +991,6 @@ static int ax25_release(struct socket *sock) sock_orphan(sk); ax25 = sk_to_ax25(sk); ax25_dev = ax25->ax25_dev; - if (ax25_dev) { - dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker); - ax25_dev_put(ax25_dev); - } if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { @@ -1056,6 +1052,10 @@ static int ax25_release(struct socket *sock) sk->sk_state_change(sk); ax25_destroy_socket(ax25); } + if (ax25_dev) { + dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker); + ax25_dev_put(ax25_dev); + } sock->sk = NULL; release_sock(sk); From 82e31755e55fbcea6a9dfaae5fe4860ade17cbc0 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Mon, 28 Mar 2022 21:00:15 +0800 Subject: [PATCH 19/57] ax25: Fix UAF bugs in ax25 timers There are race conditions that may lead to UAF bugs in ax25_heartbeat_expiry(), ax25_t1timer_expiry(), ax25_t2timer_expiry(), ax25_t3timer_expiry() and ax25_idletimer_expiry(), when we call ax25_release() to deallocate ax25_dev. One of the UAF bugs caused by ax25_release() is shown below: (Thread 1) | (Thread 2) ax25_dev_device_up() //(1) | ... | ax25_kill_by_device() ax25_bind() //(2) | ax25_connect() | ... ax25_std_establish_data_link() | ax25_start_t1timer() | ax25_dev_device_down() //(3) mod_timer(&ax25->t1timer,..) | | ax25_release() (wait a time) | ... | ax25_dev_put(ax25_dev) //(4)FREE ax25_t1timer_expiry() | ax25->ax25_dev->values[..] //USE| ... ... | We increase the refcount of ax25_dev in position (1) and (2), and decrease the refcount of ax25_dev in position (3) and (4). The ax25_dev will be freed in position (4) and be used in ax25_t1timer_expiry(). The fail log is shown below: ============================================================== [ 106.116942] BUG: KASAN: use-after-free in ax25_t1timer_expiry+0x1c/0x60 [ 106.116942] Read of size 8 at addr ffff88800bda9028 by task swapper/0/0 [ 106.116942] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.17.0-06123-g0905eec574 [ 106.116942] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-14 [ 106.116942] Call Trace: ... [ 106.116942] ax25_t1timer_expiry+0x1c/0x60 [ 106.116942] call_timer_fn+0x122/0x3d0 [ 106.116942] __run_timers.part.0+0x3f6/0x520 [ 106.116942] run_timer_softirq+0x4f/0xb0 [ 106.116942] __do_softirq+0x1c2/0x651 ... This patch adds del_timer_sync() in ax25_release(), which could ensure that all timers stop before we deallocate ax25_dev. Signed-off-by: Duoming Zhou Signed-off-by: Paolo Abeni --- net/ax25/af_ax25.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index f5686c463bc0d..363d47f945324 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1053,6 +1053,11 @@ static int ax25_release(struct socket *sock) ax25_destroy_socket(ax25); } if (ax25_dev) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); } From ad7da1ce5749c0eb4f09dd7e5510123be56f10fb Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 29 Mar 2022 00:03:50 +0200 Subject: [PATCH 20/57] net: lan966x: fix kernel oops on ioctl when I/F is down ioctls handled by phy_mii_ioctl() will cause a kernel oops when the interface is down. Fix it by making sure there is a PHY attached. Fixes: 735fec995b21 ("net: lan966x: Implement SIOCSHWTSTAMP and SIOCGHWTSTAMP") Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220328220350.3118969-1-michael@walle.cc Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index e1bcb28039dc7..1f8c67f0261bf 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -408,6 +408,9 @@ static int lan966x_port_ioctl(struct net_device *dev, struct ifreq *ifr, } } + if (!dev->phydev) + return -ENODEV; + return phy_mii_ioctl(dev->phydev, ifr, cmd); } From 6094e391e643fddd65d4b938c1d499a838dd4907 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Sat, 26 Mar 2022 01:37:31 +0530 Subject: [PATCH 21/57] dt-bindings: net: qcom,ethqos: Document SM8150 SoC compatible SM8150 has an ethernet controller and it needs a different configuration, so add a new compatible for this. Acked-by: Rob Herring Signed-off-by: Vinod Koul [bhsharma: Massage the commit log] Signed-off-by: Bhupesh Sharma Link: https://lore.kernel.org/r/20220325200731.1585554-1-bhupesh.sharma@linaro.org Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/qcom,ethqos.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.txt b/Documentation/devicetree/bindings/net/qcom,ethqos.txt index fcf5035810b57..1f5746849a716 100644 --- a/Documentation/devicetree/bindings/net/qcom,ethqos.txt +++ b/Documentation/devicetree/bindings/net/qcom,ethqos.txt @@ -7,7 +7,9 @@ This device has following properties: Required properties: -- compatible: Should be qcom,qcs404-ethqos" +- compatible: Should be one of: + "qcom,qcs404-ethqos" + "qcom,sm8150-ethqos" - reg: Address and length of the register set for the device From 866b7a278cdb51eb158cd8513bc7438fc857804a Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 29 Mar 2022 09:08:00 +0000 Subject: [PATCH 22/57] net: dsa: felix: fix possible NULL pointer dereference As the possible failure of the allocation, kzalloc() may return NULL pointer. Therefore, it should be better to check the 'sgi' in order to prevent the dereference of NULL pointer. Fixes: 23ae3a7877718 ("net: dsa: felix: add stream gate settings for psfp"). Signed-off-by: Zheng Yongjun Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220329090800.130106-1-zhengyongjun3@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 62d52e0874e9d..8d382b27e6257 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1928,6 +1928,10 @@ static int vsc9959_psfp_filter_add(struct ocelot *ocelot, int port, case FLOW_ACTION_GATE: size = struct_size(sgi, entries, a->gate.num_entries); sgi = kzalloc(size, GFP_KERNEL); + if (!sgi) { + ret = -ENOMEM; + goto err; + } vsc9959_psfp_parse_gate(a, sgi); ret = vsc9959_psfp_sgi_table_add(ocelot, sgi); if (ret) { From 8f0588e80e33273c2fa219da4622affab0cdd22f Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 29 Mar 2022 09:03:54 -0700 Subject: [PATCH 23/57] ptp: ocp: handle error from nvmem_device_find nvmem_device_find returns a valid pointer or IS_ERR(). Handle this properly. Fixes: 0cfcdd1ebcfe ("ptp: ocp: add nvmem interface for accessing eeprom") Signed-off-by: Jonathan Lemon Link: https://lore.kernel.org/r/20220329160354.4035-1-jonathan.lemon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index c3d0fcf609e37..0feaa4b453175 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -1214,10 +1214,9 @@ ptp_ocp_nvmem_device_get(struct ptp_ocp *bp, const void * const tag) static inline void ptp_ocp_nvmem_device_put(struct nvmem_device **nvmemp) { - if (*nvmemp != NULL) { + if (!IS_ERR_OR_NULL(*nvmemp)) nvmem_device_put(*nvmemp); - *nvmemp = NULL; - } + *nvmemp = NULL; } static void @@ -1241,13 +1240,15 @@ ptp_ocp_read_eeprom(struct ptp_ocp *bp) } if (!nvmem) { nvmem = ptp_ocp_nvmem_device_get(bp, tag); - if (!nvmem) - goto out; + if (IS_ERR(nvmem)) { + ret = PTR_ERR(nvmem); + goto fail; + } } ret = nvmem_device_read(nvmem, map->off, map->len, BP_MAP_ENTRY_ADDR(bp, map)); if (ret != map->len) - goto read_fail; + goto fail; } bp->has_eeprom_data = true; @@ -1256,7 +1257,7 @@ ptp_ocp_read_eeprom(struct ptp_ocp *bp) ptp_ocp_nvmem_device_put(&nvmem); return; -read_fail: +fail: dev_err(&bp->pdev->dev, "could not read eeprom: %d\n", ret); goto out; } From c9ad266bbef58dcbb6e74a6dbc5c4c2ed166e9b7 Mon Sep 17 00:00:00 2001 From: Martin Habets Date: Tue, 29 Mar 2022 17:07:49 +0100 Subject: [PATCH 24/57] sfc: Avoid NULL pointer dereference on systems without numa awareness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On such systems cpumask_of_node() returns NULL, which bitmap operations are not happy with. Fixes: c265b569a45f ("sfc: default config to 1 channel/core in local NUMA node only") Fixes: 09a99ab16c60 ("sfc: set affinity hints in local NUMA node only") Signed-off-by: Martin Habets Reviewed-by: Íñigo Huguet Link: https://lore.kernel.org/r/164857006953.8140.3265568858101821256.stgit@palantir17.mph.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/efx_channels.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c index d6fdcdc530caf..f9064532beb66 100644 --- a/drivers/net/ethernet/sfc/efx_channels.c +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -91,11 +91,9 @@ static unsigned int count_online_cores(struct efx_nic *efx, bool local_node) } cpumask_copy(filter_mask, cpu_online_mask); - if (local_node) { - int numa_node = pcibus_to_node(efx->pci_dev->bus); - - cpumask_and(filter_mask, filter_mask, cpumask_of_node(numa_node)); - } + if (local_node) + cpumask_and(filter_mask, filter_mask, + cpumask_of_pcibus(efx->pci_dev->bus)); count = 0; for_each_cpu(cpu, filter_mask) { @@ -386,8 +384,7 @@ int efx_probe_interrupts(struct efx_nic *efx) #if defined(CONFIG_SMP) void efx_set_interrupt_affinity(struct efx_nic *efx) { - int numa_node = pcibus_to_node(efx->pci_dev->bus); - const struct cpumask *numa_mask = cpumask_of_node(numa_node); + const struct cpumask *numa_mask = cpumask_of_pcibus(efx->pci_dev->bus); struct efx_channel *channel; unsigned int cpu; From ec59f128a9bd4255798abb1e06ac3b442f46ef68 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 29 Mar 2022 21:31:24 -0400 Subject: [PATCH 25/57] wireguard: queueing: use CFI-safe ptr_ring cleanup function We make too nuanced use of ptr_ring to entirely move to the skb_array wrappers, but we at least should avoid the naughty function pointer cast when cleaning up skbs. Otherwise RAP/CFI will honk at us. This patch uses the __skb_array_destroy_skb wrapper for the cleanup, rather than directly providing kfree_skb, which is what other drivers in the same situation do too. Reported-by: PaX Team Fixes: 886fcee939ad ("wireguard: receive: use ring buffer for incoming handshakes") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/queueing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireguard/queueing.c b/drivers/net/wireguard/queueing.c index 1de413b19e342..8084e7408c0ae 100644 --- a/drivers/net/wireguard/queueing.c +++ b/drivers/net/wireguard/queueing.c @@ -4,6 +4,7 @@ */ #include "queueing.h" +#include struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) @@ -42,7 +43,7 @@ void wg_packet_queue_free(struct crypt_queue *queue, bool purge) { free_percpu(queue->worker); WARN_ON(!purge && !__ptr_ring_empty(&queue->ring)); - ptr_ring_cleanup(&queue->ring, purge ? (void(*)(void*))kfree_skb : NULL); + ptr_ring_cleanup(&queue->ring, purge ? __skb_array_destroy_skb : NULL); } #define NEXT(skb) ((skb)->prev) From ca93ca23409b827b48a2fc0a692496d3f7b67944 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 29 Mar 2022 21:31:25 -0400 Subject: [PATCH 26/57] wireguard: selftests: simplify RNG seeding The seed_rng() function was written to work across lots of old kernels, back when WireGuard used a big compatibility layer. Now that things have evolved, we can vastly simplify this, by just marking the RNG as seeded. Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/init.c | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/wireguard/qemu/init.c b/tools/testing/selftests/wireguard/qemu/init.c index c9698120ac9d8..0b45055d9de0e 100644 --- a/tools/testing/selftests/wireguard/qemu/init.c +++ b/tools/testing/selftests/wireguard/qemu/init.c @@ -56,26 +56,14 @@ static void print_banner(void) static void seed_rng(void) { - int fd; - struct { - int entropy_count; - int buffer_size; - unsigned char buffer[256]; - } entropy = { - .entropy_count = sizeof(entropy.buffer) * 8, - .buffer_size = sizeof(entropy.buffer), - .buffer = "Adding real entropy is not actually important for these tests. Don't try this at home, kids!" - }; + int bits = 256, fd; - if (mknod("/dev/urandom", S_IFCHR | 0644, makedev(1, 9))) - panic("mknod(/dev/urandom)"); - fd = open("/dev/urandom", O_WRONLY); + pretty_message("[+] Fake seeding RNG..."); + fd = open("/dev/random", O_WRONLY); if (fd < 0) - panic("open(urandom)"); - for (int i = 0; i < 256; ++i) { - if (ioctl(fd, RNDADDENTROPY, &entropy) < 0) - panic("ioctl(urandom)"); - } + panic("open(random)"); + if (ioctl(fd, RNDADDTOENTCNT, &bits) < 0) + panic("ioctl(RNDADDTOENTCNT)"); close(fd); } @@ -270,10 +258,10 @@ static void check_leaks(void) int main(int argc, char *argv[]) { - seed_rng(); ensure_console(); print_banner(); mount_filesystems(); + seed_rng(); kmod_selftests(); enable_logging(); clear_leaks(); From bbbf962d9460194993ee1943a793a0a0af4a7fbf Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 29 Mar 2022 21:31:26 -0400 Subject: [PATCH 27/57] wireguard: socket: free skb in send6 when ipv6 is disabled I got a memory leak report: unreferenced object 0xffff8881191fc040 (size 232): comm "kworker/u17:0", pid 23193, jiffies 4295238848 (age 3464.870s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x84/0x3b0 [] kmem_cache_alloc_node+0x167/0x340 [] __alloc_skb+0x1db/0x200 [] wg_socket_send_buffer_to_peer+0x3d/0xc0 [] wg_packet_send_handshake_initiation+0xfa/0x110 [] wg_packet_handshake_send_worker+0x21/0x30 [] process_one_work+0x2e8/0x770 [] worker_thread+0x4a/0x4b0 [] kthread+0x120/0x160 [] ret_from_fork+0x1f/0x30 In function wg_socket_send_buffer_as_reply_to_skb() or wg_socket_send_ buffer_to_peer(), the semantics of send6() is required to free skb. But when CONFIG_IPV6 is disable, kfree_skb() is missing. This patch adds it to fix this bug. Signed-off-by: Wang Hai Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 6f07b949cb81d..467eef0e563bf 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -160,6 +160,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, rcu_read_unlock_bh(); return ret; #else + kfree_skb(skb); return -EAFNOSUPPORT; #endif } From 77fc73ac89be96ec8f39e8efa53885caa7cb3645 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 29 Mar 2022 21:31:27 -0400 Subject: [PATCH 28/57] wireguard: socket: ignore v6 endpoints when ipv6 is disabled The previous commit fixed a memory leak on the send path in the event that IPv6 is disabled at compile time, but how did a packet even arrive there to begin with? It turns out we have previously allowed IPv6 endpoints even when IPv6 support is disabled at compile time. This is awkward and inconsistent. Instead, let's just ignore all things IPv6, the same way we do other malformed endpoints, in the case where IPv6 is disabled. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 467eef0e563bf..0414d7a6ce741 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -242,7 +242,7 @@ int wg_socket_endpoint_from_skb(struct endpoint *endpoint, endpoint->addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; endpoint->src4.s_addr = ip_hdr(skb)->daddr; endpoint->src_if4 = skb->skb_iif; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { endpoint->addr6.sin6_family = AF_INET6; endpoint->addr6.sin6_port = udp_hdr(skb)->source; endpoint->addr6.sin6_addr = ipv6_hdr(skb)->saddr; @@ -285,7 +285,7 @@ void wg_socket_set_peer_endpoint(struct wg_peer *peer, peer->endpoint.addr4 = endpoint->addr4; peer->endpoint.src4 = endpoint->src4; peer->endpoint.src_if4 = endpoint->src_if4; - } else if (endpoint->addr.sa_family == AF_INET6) { + } else if (IS_ENABLED(CONFIG_IPV6) && endpoint->addr.sa_family == AF_INET6) { peer->endpoint.addr6 = endpoint->addr6; peer->endpoint.src6 = endpoint->src6; } else { From f9512d654f62604664251dedd437a22fe484974a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 29 Mar 2022 18:20:25 -0700 Subject: [PATCH 29/57] net: sparx5: uses, depends on BRIDGE or !BRIDGE Fix build errors when BRIDGE=m and SPARX5_SWITCH=y: riscv64-linux-ld: drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.o: in function `.L305': sparx5_switchdev.c:(.text+0xdb0): undefined reference to `br_vlan_enabled' riscv64-linux-ld: drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.o: in function `.L283': sparx5_switchdev.c:(.text+0xee0): undefined reference to `br_vlan_enabled' Fixes: 3cfa11bac9bb ("net: sparx5: add the basic sparx5 driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Horatiu Vultur Cc: Lars Povlsen Cc: Steen Hegelund Cc: UNGLinuxDriver@microchip.com Cc: Paolo Abeni Link: https://lore.kernel.org/r/20220330012025.29560-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/microchip/sparx5/Kconfig b/drivers/net/ethernet/microchip/sparx5/Kconfig index 85b24edb65d51..cc5e48e1bb4c3 100644 --- a/drivers/net/ethernet/microchip/sparx5/Kconfig +++ b/drivers/net/ethernet/microchip/sparx5/Kconfig @@ -5,6 +5,7 @@ config SPARX5_SWITCH depends on OF depends on ARCH_SPARX5 || COMPILE_TEST depends on PTP_1588_CLOCK_OPTIONAL + depends on BRIDGE || BRIDGE=n select PHYLINK select PHY_SPARX5_SERDES select RESET_CONTROLLER From e382fea8ae54f5bb62869c6b69b33993d43adeca Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 28 Mar 2022 13:36:11 +0200 Subject: [PATCH 30/57] can: isotp: restore accidentally removed MSG_PEEK feature In commit 42bf50a1795a ("can: isotp: support MSG_TRUNC flag when reading from socket") a new check for recvmsg flags has been introduced that only checked for the flags that are handled in isotp_recvmsg() itself. This accidentally removed the MSG_PEEK feature flag which is processed later in the call chain in __skb_try_recv_from_queue(). Add MSG_PEEK to the set of valid flags to restore the feature. Fixes: 42bf50a1795a ("can: isotp: support MSG_TRUNC flag when reading from socket") Link: https://github.com/linux-can/can-utils/issues/347#issuecomment-1079554254 Link: https://lore.kernel.org/all/20220328113611.3691-1-socketcan@hartkopp.net Reported-by: Derek Will Suggested-by: Derek Will Tested-by: Derek Will Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index f6f8ba1f816d4..bafb0fb5f0e0e 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1050,7 +1050,7 @@ static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int noblock = flags & MSG_DONTWAIT; int ret = 0; - if (flags & ~(MSG_DONTWAIT | MSG_TRUNC)) + if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK)) return -EINVAL; if (!so->bound) From fa7b514d2b2894e052b8e94c7a29feb98e90093f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 19 Mar 2022 08:31:28 -0700 Subject: [PATCH 31/57] can: mcp251xfd: mcp251xfd_register_get_dev_id(): fix return of error value Clang static analysis reports this issue: | mcp251xfd-core.c:1813:7: warning: The left operand | of '&' is a garbage value | FIELD_GET(MCP251XFD_REG_DEVID_ID_MASK, dev_id), | ^ ~~~~~~ dev_id is set in a successful call to mcp251xfd_register_get_dev_id(). Though the status of calls made by mcp251xfd_register_get_dev_id() are checked and handled, their status' are not returned. So return err. Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Link: https://lore.kernel.org/all/20220319153128.2164120-1-trix@redhat.com Signed-off-by: Tom Rix Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 325024be7b045..f9dd8fdba12bc 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1786,7 +1786,7 @@ mcp251xfd_register_get_dev_id(const struct mcp251xfd_priv *priv, u32 *dev_id, out_kfree_buf_rx: kfree(buf_rx); - return 0; + return err; } #define MCP251XFD_QUIRK_ACTIVE(quirk) \ From 2e8e79c416aae1de224c0f1860f2e3350fa171f8 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 17 Mar 2022 08:57:35 +0100 Subject: [PATCH 32/57] can: m_can: m_can_tx_handler(): fix use after free of skb can_put_echo_skb() will clone skb then free the skb. Move the can_put_echo_skb() for the m_can version 3.0.x directly before the start of the xmit in hardware, similar to the 3.1.x branch. Fixes: 80646733f11c ("can: m_can: update to support CAN FD features") Link: https://lore.kernel.org/all/20220317081305.739554-1-mkl@pengutronix.de Cc: stable@vger.kernel.org Reported-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 1a4b56f6fa8c6..b3b5bc1c803b3 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1637,8 +1637,6 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) if (err) goto out_fail; - can_put_echo_skb(skb, dev, 0, 0); - if (cdev->can.ctrlmode & CAN_CTRLMODE_FD) { cccr = m_can_read(cdev, M_CAN_CCCR); cccr &= ~CCCR_CMR_MASK; @@ -1655,6 +1653,9 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) m_can_write(cdev, M_CAN_CCCR, cccr); } m_can_write(cdev, M_CAN_TXBTIE, 0x1); + + can_put_echo_skb(skb, dev, 0, 0); + m_can_write(cdev, M_CAN_TXBAR, 0x1); /* End of xmit function for version 3.0.x */ } else { From c70222752228a62135cee3409dccefd494a24646 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Mon, 28 Feb 2022 16:36:39 +0800 Subject: [PATCH 33/57] can: ems_usb: ems_usb_start_xmit(): fix double dev_kfree_skb() in error path There is no need to call dev_kfree_skb() when usb_submit_urb() fails beacause can_put_echo_skb() deletes the original skb and can_free_echo_skb() deletes the cloned skb. Link: https://lore.kernel.org/all/20220228083639.38183-1-hbh25y@gmail.com Fixes: 702171adeed3 ("ems_usb: Added support for EMS CPC-USB/ARM7 CAN/USB interface") Cc: stable@vger.kernel.org Cc: Sebastian Haas Signed-off-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/ems_usb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 7bedceffdfa36..bbec3311d8934 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -819,7 +819,6 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne usb_unanchor_urb(urb); usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); - dev_kfree_skb(skb); atomic_dec(&dev->active_tx_urbs); From 3d3925ff6433f98992685a9679613a2cc97f3ce2 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 11 Mar 2022 16:06:14 +0800 Subject: [PATCH 34/57] can: usb_8dev: usb_8dev_start_xmit(): fix double dev_kfree_skb() in error path There is no need to call dev_kfree_skb() when usb_submit_urb() fails because can_put_echo_skb() deletes original skb and can_free_echo_skb() deletes the cloned skb. Fixes: 0024d8ad1639 ("can: usb_8dev: Add support for USB2CAN interface from 8 devices") Link: https://lore.kernel.org/all/20220311080614.45229-1-hbh25y@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/usb_8dev.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 431af1ec1e3ca..b638604bf1eef 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -663,9 +663,20 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, atomic_inc(&priv->active_tx_urbs); err = usb_submit_urb(urb, GFP_ATOMIC); - if (unlikely(err)) - goto failed; - else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) + if (unlikely(err)) { + can_free_echo_skb(netdev, context->echo_index, NULL); + + usb_unanchor_urb(urb); + usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); + + atomic_dec(&priv->active_tx_urbs); + + if (err == -ENODEV) + netif_device_detach(netdev); + else + netdev_warn(netdev, "failed tx_urb %d\n", err); + stats->tx_dropped++; + } else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) /* Slow down tx path */ netif_stop_queue(netdev); @@ -684,19 +695,6 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; -failed: - can_free_echo_skb(netdev, context->echo_index, NULL); - - usb_unanchor_urb(urb); - usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); - - atomic_dec(&priv->active_tx_urbs); - - if (err == -ENODEV) - netif_device_detach(netdev); - else - netdev_warn(netdev, "failed tx_urb %d\n", err); - nomembuf: usb_free_urb(urb); From 04c9b00ba83594a29813d6b1fb8fdc93a3915174 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 11 Mar 2022 16:02:08 +0800 Subject: [PATCH 35/57] can: mcba_usb: mcba_usb_start_xmit(): fix double dev_kfree_skb in error path There is no need to call dev_kfree_skb() when usb_submit_urb() fails because can_put_echo_skb() deletes original skb and can_free_echo_skb() deletes the cloned skb. Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Link: https://lore.kernel.org/all/20220311080208.45047-1-hbh25y@gmail.com Signed-off-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/mcba_usb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 77bddff86252b..7c198eb5bc9c3 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -364,7 +364,6 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb, xmit_failed: can_free_echo_skb(priv->netdev, ctx->ndx, NULL); mcba_usb_free_ctx(ctx); - dev_kfree_skb(skb); stats->tx_dropped++; return NETDEV_TX_OK; From 136bed0bfd3bc9c95c88aafff2d22ecb3a919f23 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 13 Mar 2022 13:09:03 +0300 Subject: [PATCH 36/57] can: mcba_usb: properly check endpoint type Syzbot reported warning in usb_submit_urb() which is caused by wrong endpoint type. We should check that in endpoint is actually present to prevent this warning. Found pipes are now saved to struct mcba_priv and code uses them directly instead of making pipes in place. Fail log: | usb 5-1: BOGUS urb xfer, pipe 3 != type 1 | WARNING: CPU: 1 PID: 49 at drivers/usb/core/urb.c:502 usb_submit_urb+0xed2/0x18a0 drivers/usb/core/urb.c:502 | Modules linked in: | CPU: 1 PID: 49 Comm: kworker/1:2 Not tainted 5.17.0-rc6-syzkaller-00184-g38f80f42147f #0 | Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 | Workqueue: usb_hub_wq hub_event | RIP: 0010:usb_submit_urb+0xed2/0x18a0 drivers/usb/core/urb.c:502 | ... | Call Trace: | | mcba_usb_start drivers/net/can/usb/mcba_usb.c:662 [inline] | mcba_usb_probe+0x8a3/0xc50 drivers/net/can/usb/mcba_usb.c:858 | usb_probe_interface+0x315/0x7f0 drivers/usb/core/driver.c:396 | call_driver_probe drivers/base/dd.c:517 [inline] Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Link: https://lore.kernel.org/all/20220313100903.10868-1-paskripkin@gmail.com Reported-and-tested-by: syzbot+3bc1dce0cc0052d60fde@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Reviewed-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/mcba_usb.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 7c198eb5bc9c3..c45a814e1de2f 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -33,10 +33,6 @@ #define MCBA_USB_RX_BUFF_SIZE 64 #define MCBA_USB_TX_BUFF_SIZE (sizeof(struct mcba_usb_msg)) -/* MCBA endpoint numbers */ -#define MCBA_USB_EP_IN 1 -#define MCBA_USB_EP_OUT 1 - /* Microchip command id */ #define MBCA_CMD_RECEIVE_MESSAGE 0xE3 #define MBCA_CMD_I_AM_ALIVE_FROM_CAN 0xF5 @@ -83,6 +79,8 @@ struct mcba_priv { atomic_t free_ctx_cnt; void *rxbuf[MCBA_MAX_RX_URBS]; dma_addr_t rxbuf_dma[MCBA_MAX_RX_URBS]; + int rx_pipe; + int tx_pipe; }; /* CAN frame */ @@ -268,10 +266,8 @@ static netdev_tx_t mcba_usb_xmit(struct mcba_priv *priv, memcpy(buf, usb_msg, MCBA_USB_TX_BUFF_SIZE); - usb_fill_bulk_urb(urb, priv->udev, - usb_sndbulkpipe(priv->udev, MCBA_USB_EP_OUT), buf, - MCBA_USB_TX_BUFF_SIZE, mcba_usb_write_bulk_callback, - ctx); + usb_fill_bulk_urb(urb, priv->udev, priv->tx_pipe, buf, MCBA_USB_TX_BUFF_SIZE, + mcba_usb_write_bulk_callback, ctx); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); @@ -607,7 +603,7 @@ static void mcba_usb_read_bulk_callback(struct urb *urb) resubmit_urb: usb_fill_bulk_urb(urb, priv->udev, - usb_rcvbulkpipe(priv->udev, MCBA_USB_EP_OUT), + priv->rx_pipe, urb->transfer_buffer, MCBA_USB_RX_BUFF_SIZE, mcba_usb_read_bulk_callback, priv); @@ -652,7 +648,7 @@ static int mcba_usb_start(struct mcba_priv *priv) urb->transfer_dma = buf_dma; usb_fill_bulk_urb(urb, priv->udev, - usb_rcvbulkpipe(priv->udev, MCBA_USB_EP_IN), + priv->rx_pipe, buf, MCBA_USB_RX_BUFF_SIZE, mcba_usb_read_bulk_callback, priv); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; @@ -806,6 +802,13 @@ static int mcba_usb_probe(struct usb_interface *intf, struct mcba_priv *priv; int err; struct usb_device *usbdev = interface_to_usbdev(intf); + struct usb_endpoint_descriptor *in, *out; + + err = usb_find_common_endpoints(intf->cur_altsetting, &in, &out, NULL, NULL); + if (err) { + dev_err(&intf->dev, "Can't find endpoints\n"); + return err; + } netdev = alloc_candev(sizeof(struct mcba_priv), MCBA_MAX_TX_URBS); if (!netdev) { @@ -851,6 +854,9 @@ static int mcba_usb_probe(struct usb_interface *intf, goto cleanup_free_candev; } + priv->rx_pipe = usb_rcvbulkpipe(priv->udev, in->bEndpointAddress); + priv->tx_pipe = usb_sndbulkpipe(priv->udev, out->bEndpointAddress); + devm_can_led_init(netdev); /* Start USB dev only if we have successfully registered CAN device */ From 50d34a0d151dc7abbdbec781bd7f09f2b3cbf01a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 29 Mar 2022 21:29:43 +0200 Subject: [PATCH 37/57] can: gs_usb: gs_make_candev(): fix memory leak for devices with extended bit timing configuration Some CAN-FD capable devices offer extended bit timing information for the data bit timing. The information must be read with an USB control message. The memory for this message is allocated but not free()ed (in the non error case). This patch adds the missing free. Fixes: 6679f4c5e5a6 ("can: gs_usb: add extended bt_const feature") Link: https://lore.kernel.org/all/20220329193450.659726-1-mkl@pengutronix.de Reported-by: syzbot+4d0ae90a195b269f102d@syzkaller.appspotmail.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 67408e3160621..b29ba9138866b 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -1092,6 +1092,8 @@ static struct gs_can *gs_make_candev(unsigned int channel, dev->data_bt_const.brp_inc = le32_to_cpu(bt_const_extended->dbrp_inc); dev->can.data_bittiming_const = &dev->data_bt_const; + + kfree(bt_const_extended); } SET_NETDEV_DEV(netdev, &intf->dev); From 50386f7526dd7fd71a390966ea9736c8420aa31f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:52 -0700 Subject: [PATCH 38/57] docs: netdev: replace references to old archives Most people use (or should use) lore at this point. Replace the pointers to older archiving systems. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index e26532f497605..25b8a7de737c2 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -16,10 +16,8 @@ Note that some subsystems (e.g. wireless drivers) which have a high volume of traffic have their own specific mailing lists. The netdev list is managed (like many other Linux mailing lists) through -VGER (http://vger.kernel.org/) and archives can be found below: - -- http://marc.info/?l=linux-netdev -- http://www.spinics.net/lists/netdev/ +VGER (http://vger.kernel.org/) with archives available at +https://lore.kernel.org/netdev/ Aside from subsystems like that mentioned above, all network-related Linux development (i.e. RFC, review, comments, etc.) takes place on From 30cddd30532a72f857abf9d7c80d8b620fd3e5a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:53 -0700 Subject: [PATCH 39/57] docs: netdev: minor reword that -> those Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 25b8a7de737c2..f7e5755e013e4 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -19,7 +19,7 @@ The netdev list is managed (like many other Linux mailing lists) through VGER (http://vger.kernel.org/) with archives available at https://lore.kernel.org/netdev/ -Aside from subsystems like that mentioned above, all network-related +Aside from subsystems like those mentioned above, all network-related Linux development (i.e. RFC, review, comments, etc.) takes place on netdev. From c82d90b14f6c23f4e7cd89944f9a89f89223dc58 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:54 -0700 Subject: [PATCH 40/57] docs: netdev: move the patch marking section up We want people to mark their patches with net and net-next in the subject. Many miss doing that. Move the FAQ section which points that out up, and place it after the section which enumerates the trees, that seems like a pretty logical place for it. Since the two sections are together we can remove a little bit (not too much) of the repetition. v2: also remove the text for non-git setups, we want people to use git. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index f7e5755e013e4..fd5f5a1a08467 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -35,6 +35,17 @@ for the future release. You can find the trees here: - https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git - https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git +How do I indicate which tree (net vs. net-next) my patch should be in? +---------------------------------------------------------------------- +To help maintainers and CI bots you should explicitly mark which tree +your patch is targeting. Assuming that you use git, use the prefix +flag:: + + git format-patch --subject-prefix='PATCH net-next' start..finish + +Use ``net`` instead of ``net-next`` (always lower case) in the above for +bug-fix ``net`` content. + How often do changes from these trees make it to the mainline Linus tree? ------------------------------------------------------------------------- To understand this, you need to know a bit of background information on @@ -90,20 +101,6 @@ and note the top of the "tags" section. If it is rc1, it is early in the dev cycle. If it was tagged rc7 a week ago, then a release is probably imminent. -How do I indicate which tree (net vs. net-next) my patch should be in? ----------------------------------------------------------------------- -Firstly, think whether you have a bug fix or new "next-like" content. -Then once decided, assuming that you use git, use the prefix flag, i.e. -:: - - git format-patch --subject-prefix='PATCH net-next' start..finish - -Use ``net`` instead of ``net-next`` (always lower case) in the above for -bug-fix ``net`` content. If you don't use git, then note the only magic -in the above is just the subject text of the outgoing e-mail, and you -can manually change it yourself with whatever MUA you are comfortable -with. - I sent a patch and I'm wondering what happened to it - how can I tell whether it got merged? -------------------------------------------------------------------------------------------- Start by looking at the main patchworks queue for netdev: From 2fd4c50dbff19eb3d798236074bfa3226b5df4a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:55 -0700 Subject: [PATCH 41/57] docs: netdev: turn the net-next closed into a Warning Use the sphinx Warning box to make the net-next being closed stand out more. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index fd5f5a1a08467..041993258dda3 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -70,8 +70,9 @@ relating to vX.Y An announcement indicating when ``net-next`` has been closed is usually sent to netdev, but knowing the above, you can predict that in advance. -IMPORTANT: Do not send new ``net-next`` content to netdev during the -period during which ``net-next`` tree is closed. +.. warning:: + Do not send new ``net-next`` content to netdev during the + period during which ``net-next`` tree is closed. Shortly after the two weeks have passed (and vX.Y-rc1 is released), the tree for ``net-next`` reopens to collect content for the next (vX.Y+1) From 0e242e3fb7a7f6c120195c5a57faf78a321dbf65 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:56 -0700 Subject: [PATCH 42/57] docs: netdev: note that RFC postings are allowed any time Document that RFCs are allowed during the merge window. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 041993258dda3..f4c77efa75d4c 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -74,6 +74,9 @@ sent to netdev, but knowing the above, you can predict that in advance. Do not send new ``net-next`` content to netdev during the period during which ``net-next`` tree is closed. +RFC patches sent for review only are obviously welcome at any time +(use ``--subject-prefix='RFC net-next'`` with ``git format-patch``). + Shortly after the two weeks have passed (and vX.Y-rc1 is released), the tree for ``net-next`` reopens to collect content for the next (vX.Y+1) release. From 5d84921ac750d2b2e42c7102890b978cb2c2729e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:57 -0700 Subject: [PATCH 43/57] docs: netdev: shorten the name and mention msgid for patch status Cut down the length of the question so it renders better in docs. Mention that Message-ID can be used to search patchwork. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index f4c77efa75d4c..e10a8140d6420 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -105,14 +105,16 @@ and note the top of the "tags" section. If it is rc1, it is early in the dev cycle. If it was tagged rc7 a week ago, then a release is probably imminent. -I sent a patch and I'm wondering what happened to it - how can I tell whether it got merged? --------------------------------------------------------------------------------------------- +How can I tell the status of a patch I've sent? +----------------------------------------------- Start by looking at the main patchworks queue for netdev: https://patchwork.kernel.org/project/netdevbpf/list/ The "State" field will tell you exactly where things are at with your -patch. +patch. Patches are indexed by the ``Message-ID`` header of the emails +which carried them so if you have trouble finding your patch append +the value of ``Message-ID`` to the URL above. The above only says "Under Review". How can I find out more? ------------------------------------------------------------- From 8f785c1bb84f7c8ca07c408dc96e8d8358b02bb3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:58 -0700 Subject: [PATCH 44/57] docs: netdev: rephrase the 'Under review' question The semantics of "Under review" have shifted. Reword the question about it a bit and focus it on the response time. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index e10a8140d6420..00ac300ebe6a6 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -116,10 +116,12 @@ patch. Patches are indexed by the ``Message-ID`` header of the emails which carried them so if you have trouble finding your patch append the value of ``Message-ID`` to the URL above. -The above only says "Under Review". How can I find out more? -------------------------------------------------------------- +How long before my patch is accepted? +------------------------------------- Generally speaking, the patches get triaged quickly (in less than -48h). So be patient. Asking the maintainer for status updates on your +48h). But be patient, if your patch is active in patchwork (i.e. it's +listed on the project's patch list) the chances it was missed are close to zero. +Asking the maintainer for status updates on your patch is a good way to ensure your patch is ignored or pushed to the bottom of the priority list. From 724c1a7443c5ccb7cda9bfb5ccab84bcece2c50c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:24:59 -0700 Subject: [PATCH 45/57] docs: netdev: rephrase the 'should I update patchwork' question Make the question shorter and adjust the start of the answer accordingly. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 00ac300ebe6a6..9c455d08510df 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -125,9 +125,11 @@ Asking the maintainer for status updates on your patch is a good way to ensure your patch is ignored or pushed to the bottom of the priority list. -I submitted multiple versions of the patch series. Should I directly update patchwork for the previous versions of these patch series? --------------------------------------------------------------------------------------------------------------------------------------- -No, please don't interfere with the patch status on patchwork, leave +Should I directly update patchwork state of my own patches? +----------------------------------------------------------- +It may be tempting to help the maintainers and update the state of your +own patches when you post a new version or spot a bug. Please do not do that. +Interfering with the patch status on patchwork will only cause confusion. Leave it to the maintainer to figure out what is the most recent and current version that should be applied. If there is any doubt, the maintainer will reply and ask what should be done. From b8ba106378a058711d8abd0889351ff9c81850c1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:25:00 -0700 Subject: [PATCH 46/57] docs: netdev: add a question about re-posting frequency We have to tell people to stop reposting to often lately, or not to repost while the discussion is ongoing. Document this. Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 9c455d08510df..f8b89dc81cab2 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -140,6 +140,17 @@ No, please resend the entire patch series and make sure you do number your patches such that it is clear this is the latest and greatest set of patches that can be applied. +I have received review feedback, when should I post a revised version of the patches? +------------------------------------------------------------------------------------- +Allow at least 24 hours to pass between postings. This will ensure reviewers +from all geographical locations have a chance to chime in. Do not wait +too long (weeks) between postings either as it will make it harder for reviewers +to recall all the context. + +Make sure you address all the feedback in your new posting. Do not post a new +version of the code if the discussion about the previous version is still +ongoing, unless directly instructed by a reviewer. + I submitted multiple versions of a patch series and it looks like a version other than the last one has been accepted, what should I do? ---------------------------------------------------------------------------------------------------------------------------------------- There is no revert possible, once it is pushed out, it stays like that. From 3eca381457ca58fbde16f827ddfaaecde3d61127 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:25:01 -0700 Subject: [PATCH 47/57] docs: netdev: make the testing requirement more stringent These days we often ask for selftests so let's update our testing requirements. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index f8b89dc81cab2..1388f78cfbc57 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -196,11 +196,15 @@ as possible alternative mechanisms. What level of testing is expected before I submit my change? ------------------------------------------------------------ -If your changes are against ``net-next``, the expectation is that you -have tested by layering your changes on top of ``net-next``. Ideally -you will have done run-time testing specific to your change, but at a -minimum, your changes should survive an ``allyesconfig`` and an -``allmodconfig`` build without new warnings or failures. +At the very minimum your changes must survive an ``allyesconfig`` and an +``allmodconfig`` build with ``W=1`` set without new warnings or failures. + +Ideally you will have done run-time testing specific to your change, +and the patch series contains a set of kernel selftest for +``tools/testing/selftests/net`` or using the KUnit framework. + +You are expected to test your changes on top of the relevant networking +tree (``net`` or ``net-next``) and not e.g. a stable tree or ``linux-next``. How do I post corresponding changes to user space components? ------------------------------------------------------------- From a300597318771f889136db36f9f0dcfd26b84f18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:25:02 -0700 Subject: [PATCH 48/57] docs: netdev: add missing back ticks I think double back ticks are more correct. Add where they are missing. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 1388f78cfbc57..294ad9b0162d9 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -218,7 +218,7 @@ or the user space project is not reviewed on netdev include a link to a public repo where user space patches can be seen. In case user space tooling lives in a separate repository but is -reviewed on netdev (e.g. patches to `iproute2` tools) kernel and +reviewed on netdev (e.g. patches to ``iproute2`` tools) kernel and user space patches should form separate series (threads) when posted to the mailing list, e.g.:: @@ -251,18 +251,18 @@ traffic if we can help it. netdevsim is great, can I extend it for my out-of-tree tests? ------------------------------------------------------------- -No, `netdevsim` is a test vehicle solely for upstream tests. -(Please add your tests under tools/testing/selftests/.) +No, ``netdevsim`` is a test vehicle solely for upstream tests. +(Please add your tests under ``tools/testing/selftests/``.) -We also give no guarantees that `netdevsim` won't change in the future +We also give no guarantees that ``netdevsim`` won't change in the future in a way which would break what would normally be considered uAPI. Is netdevsim considered a "user" of an API? ------------------------------------------- Linux kernel has a long standing rule that no API should be added unless -it has a real, in-tree user. Mock-ups and tests based on `netdevsim` are -strongly encouraged when adding new APIs, but `netdevsim` in itself +it has a real, in-tree user. Mock-ups and tests based on ``netdevsim`` are +strongly encouraged when adding new APIs, but ``netdevsim`` in itself is **not** considered a use case/user. Any other tips to help ensure my net/net-next patch gets OK'd? From 99eba4e5cbd462db47dcb949af6d5474acdac953 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:25:03 -0700 Subject: [PATCH 49/57] docs: netdev: call out the merge window in tag checking Add the most important case to the question about "where are we in the cycle" - the case of net-next being closed. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 294ad9b0162d9..a18e4e671e85a 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -103,7 +103,9 @@ Load the mainline (Linus) page here: and note the top of the "tags" section. If it is rc1, it is early in the dev cycle. If it was tagged rc7 a week ago, then a release is -probably imminent. +probably imminent. If the most recent tag is a final release tag +(without an ``-rcN`` suffix) - we are most likely in a merge window +and ``net-next`` is closed. How can I tell the status of a patch I've sent? ----------------------------------------------- From 08767a26f095909423f963c864f15db50c0ce9a7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:25:04 -0700 Subject: [PATCH 50/57] docs: netdev: broaden the new vs old code formatting guidelines Convert the "should I use new or old comment formatting" to cover all formatting. This makes the question itself shorter. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/networking/netdev-FAQ.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index a18e4e671e85a..c456b5225d664 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -183,10 +183,10 @@ it is requested that you make it look like this:: * another line of text */ -I am working in existing code that has the former comment style and not the latter. Should I submit new code in the former style or the latter? ------------------------------------------------------------------------------------------------------------------------------------------------ -Make it the latter style, so that eventually all code in the domain -of netdev is of this format. +I am working in existing code which uses non-standard formatting. Which formatting should I use? +------------------------------------------------------------------------------------------------ +Make your code follow the most recent guidelines, so that eventually all code +in the domain of netdev is in the preferred format. I found a bug that might have possible security implications or similar. Should I mail the main netdev maintainer off-list? --------------------------------------------------------------------------------------------------------------------------- From 8df0136376dc9227a45fd6a1420016f58792b5d0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Mar 2022 21:25:05 -0700 Subject: [PATCH 51/57] docs: netdev: move the netdev-FAQ to the process pages The documentation for the tip tree is really in quite a similar spirit to the netdev-FAQ. Move the netdev-FAQ to the process docs as well. Signed-off-by: Jakub Kicinski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- Documentation/bpf/bpf_devel_QA.rst | 2 +- Documentation/networking/index.rst | 3 ++- Documentation/process/maintainer-handbooks.rst | 1 + .../netdev-FAQ.rst => process/maintainer-netdev.rst} | 0 MAINTAINERS | 1 + 5 files changed, 5 insertions(+), 2 deletions(-) rename Documentation/{networking/netdev-FAQ.rst => process/maintainer-netdev.rst} (100%) diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 253496af8fef2..761474bd7fe6e 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -658,7 +658,7 @@ when: .. Links .. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/ -.. _netdev-FAQ: ../networking/netdev-FAQ.rst +.. _netdev-FAQ: Documentation/process/maintainer-netdev.rst .. _selftests: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/ .. _Documentation/dev-tools/kselftest.rst: diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index ce017136ab053..72cf33579b785 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -1,12 +1,13 @@ Linux Networking Documentation ============================== +Refer to :ref:`netdev-FAQ` for a guide on netdev development process specifics. + Contents: .. toctree:: :maxdepth: 2 - netdev-FAQ af_xdp bareudp batman-adv diff --git a/Documentation/process/maintainer-handbooks.rst b/Documentation/process/maintainer-handbooks.rst index 6af1abb0da482..d783060b4cc6e 100644 --- a/Documentation/process/maintainer-handbooks.rst +++ b/Documentation/process/maintainer-handbooks.rst @@ -16,3 +16,4 @@ Contents: :maxdepth: 2 maintainer-tip + maintainer-netdev diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/process/maintainer-netdev.rst similarity index 100% rename from Documentation/networking/netdev-FAQ.rst rename to Documentation/process/maintainer-netdev.rst diff --git a/MAINTAINERS b/MAINTAINERS index d91f6c6e3d3b5..00dd58fc8bf3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13661,6 +13661,7 @@ B: mailto:netdev@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git F: Documentation/networking/ +F: Documentation/process/maintainer-netdev.rst F: include/linux/in.h F: include/linux/net.h F: include/linux/netdevice.h From 9c9a04212fa380d2e7d1412bb281309955c0a781 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Wed, 30 Mar 2022 21:45:05 +0800 Subject: [PATCH 52/57] net: hns3: fix the concurrency between functions reading debugfs Currently, the debugfs mechanism is that all functions share a global variable to save the pointer for obtaining data. When different functions concurrently access the same file node, repeated release exceptions occur. Therefore, the granularity of the pointer for storing the obtained data is adjusted to be private for each function. Fixes: 5e69ea7ee2a6 ("net: hns3: refactor the debugfs process") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + .../net/ethernet/hisilicon/hns3/hns3_debugfs.c | 15 +++++++++++---- .../net/ethernet/hisilicon/hns3/hns3_debugfs.h | 1 - 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index d44dd7091fa13..79c64f4e67d2b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -845,6 +845,7 @@ struct hnae3_handle { struct dentry *hnae3_dbgfs; /* protects concurrent contention between debugfs commands */ struct mutex dbgfs_lock; + char **dbgfs_buf; /* Network interface message level enabled bits */ u32 msg_enable; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index f726a5b70f9e2..44d9b560b3374 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1227,7 +1227,7 @@ static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, return ret; mutex_lock(&handle->dbgfs_lock); - save_buf = &hns3_dbg_cmd[index].buf; + save_buf = &handle->dbgfs_buf[index]; if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) { @@ -1332,6 +1332,13 @@ int hns3_dbg_init(struct hnae3_handle *handle) int ret; u32 i; + handle->dbgfs_buf = devm_kcalloc(&handle->pdev->dev, + ARRAY_SIZE(hns3_dbg_cmd), + sizeof(*handle->dbgfs_buf), + GFP_KERNEL); + if (!handle->dbgfs_buf) + return -ENOMEM; + hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry = debugfs_create_dir(name, hns3_dbgfs_root); handle->hnae3_dbgfs = hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry; @@ -1380,9 +1387,9 @@ void hns3_dbg_uninit(struct hnae3_handle *handle) u32 i; for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) - if (hns3_dbg_cmd[i].buf) { - kvfree(hns3_dbg_cmd[i].buf); - hns3_dbg_cmd[i].buf = NULL; + if (handle->dbgfs_buf[i]) { + kvfree(handle->dbgfs_buf[i]); + handle->dbgfs_buf[i] = NULL; } mutex_destroy(&handle->dbgfs_lock); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h index 83aa1450ab9fe..97578eabb7d8b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.h @@ -49,7 +49,6 @@ struct hns3_dbg_cmd_info { enum hnae3_dbg_cmd cmd; enum hns3_dbg_dentry_type dentry; u32 buf_len; - char *buf; int (*init)(struct hnae3_handle *handle, unsigned int cmd); }; From 7ed258f12ec5ce855f15cdfb5710361dc82fe899 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 30 Mar 2022 21:45:06 +0800 Subject: [PATCH 53/57] net: hns3: fix software vlan talbe of vlan 0 inconsistent with hardware When user delete vlan 0, as driver will not delete vlan 0 for hardware in function hclge_set_vlan_filter_hw(), so vlan 0 in software vlan talbe should not be deleted. Fixes: fe4144d47eef ("net: hns3: sync VLAN filter entries when kill VLAN ID failed") Signed-off-by: Guangbin Huang Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 2a5e6a241d527..8cebb180c812f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10323,11 +10323,11 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, } if (!ret) { - if (is_kill) - hclge_rm_vport_vlan_table(vport, vlan_id, false); - else + if (!is_kill) hclge_add_vport_vlan_table(vport, vlan_id, writen_to_tbl); + else if (is_kill && vlan_id != 0) + hclge_rm_vport_vlan_table(vport, vlan_id, false); } else if (is_kill) { /* when remove hw vlan filter failed, record the vlan id, * and try to remove it from hw later, to be consistence From 4a7f62f91933c8ae5308f9127fd8ea48188b6bc3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 30 Mar 2022 15:39:16 +0100 Subject: [PATCH 54/57] rxrpc: Fix call timer start racing with call destruction The rxrpc_call struct has a timer used to handle various timed events relating to a call. This timer can get started from the packet input routines that are run in softirq mode with just the RCU read lock held. Unfortunately, because only the RCU read lock is held - and neither ref or other lock is taken - the call can start getting destroyed at the same time a packet comes in addressed to that call. This causes the timer - which was already stopped - to get restarted. Later, the timer dispatch code may then oops if the timer got deallocated first. Fix this by trying to take a ref on the rxrpc_call struct and, if successful, passing that ref along to the timer. If the timer was already running, the ref is discarded. The timer completion routine can then pass the ref along to the call's work item when it queues it. If the timer or work item where already queued/running, the extra ref is discarded. Fixes: a158bdd3247b ("rxrpc: Fix call timeouts") Reported-by: Marc Dionne Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-March/005073.html Link: https://lore.kernel.org/r/164865115696.2943015.11097991776647323586.stgit@warthog.procyon.org.uk Signed-off-by: Paolo Abeni --- include/trace/events/rxrpc.h | 8 +++++++- net/rxrpc/ar-internal.h | 15 +++++++------- net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 40 +++++++++++++++++++++++++++++++----- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e70c90116edae..4a3ab0ed6e062 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -83,12 +83,15 @@ enum rxrpc_call_trace { rxrpc_call_error, rxrpc_call_got, rxrpc_call_got_kernel, + rxrpc_call_got_timer, rxrpc_call_got_userid, rxrpc_call_new_client, rxrpc_call_new_service, rxrpc_call_put, rxrpc_call_put_kernel, rxrpc_call_put_noqueue, + rxrpc_call_put_notimer, + rxrpc_call_put_timer, rxrpc_call_put_userid, rxrpc_call_queued, rxrpc_call_queued_ref, @@ -278,12 +281,15 @@ enum rxrpc_tx_point { EM(rxrpc_call_error, "*E*") \ EM(rxrpc_call_got, "GOT") \ EM(rxrpc_call_got_kernel, "Gke") \ + EM(rxrpc_call_got_timer, "GTM") \ EM(rxrpc_call_got_userid, "Gus") \ EM(rxrpc_call_new_client, "NWc") \ EM(rxrpc_call_new_service, "NWs") \ EM(rxrpc_call_put, "PUT") \ EM(rxrpc_call_put_kernel, "Pke") \ - EM(rxrpc_call_put_noqueue, "PNQ") \ + EM(rxrpc_call_put_noqueue, "PnQ") \ + EM(rxrpc_call_put_notimer, "PnT") \ + EM(rxrpc_call_put_timer, "PTM") \ EM(rxrpc_call_put_userid, "Pus") \ EM(rxrpc_call_queued, "QUE") \ EM(rxrpc_call_queued_ref, "QUR") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7bd6f8a66a3ef..969e532f77a90 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -777,14 +777,12 @@ void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); -static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call, - unsigned long expire_at, - unsigned long now, - enum rxrpc_timer_trace why) -{ - trace_rxrpc_timer(call, why, now); - timer_reduce(&call->timer, expire_at); -} +void rxrpc_reduce_call_timer(struct rxrpc_call *call, + unsigned long expire_at, + unsigned long now, + enum rxrpc_timer_trace why); + +void rxrpc_delete_call_timer(struct rxrpc_call *call); /* * call_object.c @@ -808,6 +806,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); +bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index df864e6922679..22e05de5d1ca9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -310,7 +310,7 @@ void rxrpc_process_call(struct work_struct *work) } if (call->state == RXRPC_CALL_COMPLETE) { - del_timer_sync(&call->timer); + rxrpc_delete_call_timer(call); goto out_put; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 4eb91d958a48d..043508fd8d8a5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -53,10 +53,30 @@ static void rxrpc_call_timer_expired(struct timer_list *t) if (call->state < RXRPC_CALL_COMPLETE) { trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); - rxrpc_queue_call(call); + __rxrpc_queue_call(call); + } else { + rxrpc_put_call(call, rxrpc_call_put); + } +} + +void rxrpc_reduce_call_timer(struct rxrpc_call *call, + unsigned long expire_at, + unsigned long now, + enum rxrpc_timer_trace why) +{ + if (rxrpc_try_get_call(call, rxrpc_call_got_timer)) { + trace_rxrpc_timer(call, why, now); + if (timer_reduce(&call->timer, expire_at)) + rxrpc_put_call(call, rxrpc_call_put_notimer); } } +void rxrpc_delete_call_timer(struct rxrpc_call *call) +{ + if (del_timer_sync(&call->timer)) + rxrpc_put_call(call, rxrpc_call_put_timer); +} + static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; /* @@ -463,6 +483,17 @@ void rxrpc_see_call(struct rxrpc_call *call) } } +bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) +{ + const void *here = __builtin_return_address(0); + int n = atomic_fetch_add_unless(&call->usage, 1, 0); + + if (n == 0) + return false; + trace_rxrpc_call(call->debug_id, op, n, here, NULL); + return true; +} + /* * Note the addition of a ref on a call. */ @@ -510,8 +541,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) spin_unlock_bh(&call->lock); rxrpc_put_call_slot(call); - - del_timer_sync(&call->timer); + rxrpc_delete_call_timer(call); /* Make sure we don't get any more notifications */ write_lock_bh(&rx->recvmsg_lock); @@ -618,6 +648,8 @@ static void rxrpc_destroy_call(struct work_struct *work) struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); struct rxrpc_net *rxnet = call->rxnet; + rxrpc_delete_call_timer(call); + rxrpc_put_connection(call->conn); rxrpc_put_peer(call->peer); kfree(call->rxtx_buffer); @@ -652,8 +684,6 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - del_timer_sync(&call->timer); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); From ff8376ade4f668130385839cef586a0990f8ef87 Mon Sep 17 00:00:00 2001 From: Xiaolong Huang Date: Wed, 30 Mar 2022 15:22:14 +0100 Subject: [PATCH 55/57] rxrpc: fix some null-ptr-deref bugs in server_key.c Some function calls are not implemented in rxrpc_no_security, there are preparse_server_key, free_preparse_server_key and destroy_server_key. When rxrpc security type is rxrpc_no_security, user can easily trigger a null-ptr-deref bug via ioctl. So judgment should be added to prevent it The crash log: user@syzkaller:~$ ./rxrpc_preparse_s [ 37.956878][T15626] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 37.957645][T15626] #PF: supervisor instruction fetch in kernel mode [ 37.958229][T15626] #PF: error_code(0x0010) - not-present page [ 37.958762][T15626] PGD 4aadf067 P4D 4aadf067 PUD 4aade067 PMD 0 [ 37.959321][T15626] Oops: 0010 [#1] PREEMPT SMP [ 37.959739][T15626] CPU: 0 PID: 15626 Comm: rxrpc_preparse_ Not tainted 5.17.0-01442-gb47d5a4f6b8d #43 [ 37.960588][T15626] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 37.961474][T15626] RIP: 0010:0x0 [ 37.961787][T15626] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. [ 37.962480][T15626] RSP: 0018:ffffc9000d9abdc0 EFLAGS: 00010286 [ 37.963018][T15626] RAX: ffffffff84335200 RBX: ffff888012a1ce80 RCX: 0000000000000000 [ 37.963727][T15626] RDX: 0000000000000000 RSI: ffffffff84a736dc RDI: ffffc9000d9abe48 [ 37.964425][T15626] RBP: ffffc9000d9abe48 R08: 0000000000000000 R09: 0000000000000002 [ 37.965118][T15626] R10: 000000000000000a R11: f000000000000000 R12: ffff888013145680 [ 37.965836][T15626] R13: 0000000000000000 R14: ffffffffffffffec R15: ffff8880432aba80 [ 37.966441][T15626] FS: 00007f2177907700(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000 [ 37.966979][T15626] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.967384][T15626] CR2: ffffffffffffffd6 CR3: 000000004aaf1000 CR4: 00000000000006f0 [ 37.967864][T15626] Call Trace: [ 37.968062][T15626] [ 37.968240][T15626] rxrpc_preparse_s+0x59/0x90 [ 37.968541][T15626] key_create_or_update+0x174/0x510 [ 37.968863][T15626] __x64_sys_add_key+0x139/0x1d0 [ 37.969165][T15626] do_syscall_64+0x35/0xb0 [ 37.969451][T15626] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 37.969824][T15626] RIP: 0033:0x43a1f9 Signed-off-by: Xiaolong Huang Tested-by: Xiaolong Huang Signed-off-by: David Howells Acked-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-March/005069.html Fixes: 12da59fcab5a ("rxrpc: Hand server key parsing off to the security class") Link: https://lore.kernel.org/r/164865013439.2941502.8966285221215590921.stgit@warthog.procyon.org.uk Signed-off-by: Paolo Abeni --- net/rxrpc/server_key.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index ead3471307ee5..ee269e0e6ee87 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -84,6 +84,9 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) prep->payload.data[1] = (struct rxrpc_security *)sec; + if (!sec->preparse_server_key) + return -EINVAL; + return sec->preparse_server_key(prep); } @@ -91,7 +94,7 @@ static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec = prep->payload.data[1]; - if (sec) + if (sec && sec->free_preparse_server_key) sec->free_preparse_server_key(prep); } @@ -99,7 +102,7 @@ static void rxrpc_destroy_s(struct key *key) { const struct rxrpc_security *sec = key->payload.data[1]; - if (sec) + if (sec && sec->destroy_server_key) sec->destroy_server_key(key); } From ea07af2e71cdf2c08251d8ead196ce1c9466e38c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 30 Mar 2022 15:42:45 -0400 Subject: [PATCH 56/57] openvswitch: Add recirc_id to recirc warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When hitting the recirculation limit, the kernel would currently log something like this: [ 58.586597] openvswitch: ovs-system: deferred action limit reached, drop recirc action Which isn't all that useful to debug as we only have the interface name to go on but can't track it down to a specific flow. With this change, we now instead get: [ 58.586597] openvswitch: ovs-system: deferred action limit reached, drop recirc action (recirc_id=0x9e) Which can now be correlated with the flow entries from OVS. Suggested-by: Frode Nordahl Signed-off-by: Stéphane Graber Tested-by: Stephane Graber Acked-by: Eelco Chaudron Link: https://lore.kernel.org/r/20220330194244.3476544-1-stgraber@ubuntu.com Signed-off-by: Jakub Kicinski --- net/openvswitch/actions.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 780d9e2246f39..7056cb1b8ba0f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1539,8 +1539,8 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, pr_warn("%s: deferred action limit reached, drop sample action\n", ovs_dp_name(dp)); } else { /* Recirc action */ - pr_warn("%s: deferred action limit reached, drop recirc action\n", - ovs_dp_name(dp)); + pr_warn("%s: deferred action limit reached, drop recirc action (recirc_id=%#x)\n", + ovs_dp_name(dp), recirc_id); } } } From 9d570741aec1e1ebd37823b34a2958f24809ff24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Mar 2022 12:46:43 -0700 Subject: [PATCH 57/57] vxlan: do not feed vxlan_vnifilter_dump_dev with non vxlan devices vxlan_vnifilter_dump_dev() assumes it is called only for vxlan devices. Make sure it is the case. BUG: KASAN: slab-out-of-bounds in vxlan_vnifilter_dump_dev+0x9a0/0xb40 drivers/net/vxlan/vxlan_vnifilter.c:349 Read of size 4 at addr ffff888060d1ce70 by task syz-executor.3/17662 CPU: 0 PID: 17662 Comm: syz-executor.3 Tainted: G W 5.17.0-syzkaller-12888-g77c9387c0c5b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0xeb/0x495 mm/kasan/report.c:313 print_report mm/kasan/report.c:429 [inline] kasan_report.cold+0xf4/0x1c6 mm/kasan/report.c:491 vxlan_vnifilter_dump_dev+0x9a0/0xb40 drivers/net/vxlan/vxlan_vnifilter.c:349 vxlan_vnifilter_dump+0x3ff/0x650 drivers/net/vxlan/vxlan_vnifilter.c:428 netlink_dump+0x4b5/0xb70 net/netlink/af_netlink.c:2270 __netlink_dump_start+0x647/0x900 net/netlink/af_netlink.c:2375 netlink_dump_start include/linux/netlink.h:245 [inline] rtnetlink_rcv_msg+0x70c/0xb80 net/core/rtnetlink.c:5953 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2496 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x6e2/0x800 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2496 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0x80 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f87b8e89049 Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Roopa Prabhu Link: https://lore.kernel.org/r/20220330194643.2706132-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_vnifilter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 9f28d0b6a6b26..3e04af4c5daa1 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -425,6 +425,12 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb err = -ENODEV; goto out_err; } + if (!netif_is_vxlan(dev)) { + NL_SET_ERR_MSG(cb->extack, + "The device is not a vxlan device"); + err = -EINVAL; + goto out_err; + } err = vxlan_vnifilter_dump_dev(dev, skb, cb); /* if the dump completed without an error we return 0 here */ if (err != -EMSGSIZE)