From a069a22f391019e84390f4e8c1a9c531ba4fb28f Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Sun, 18 Aug 2024 21:48:03 +0900
Subject: [PATCH 1/7] tracing: fgraph: Fix to add new fgraph_ops to array after
 ftrace_startup_subops()

Since the register_ftrace_graph() assigns a new fgraph_ops to
fgraph_array before registring it by ftrace_startup_subops(), the new
fgraph_ops can be used in function_graph_enter().

In most cases, it is still OK because those fgraph_ops's hashtable is
already initialized by ftrace_set_filter*() etc.

But if a user registers a new fgraph_ops which does not initialize the
hash list, ftrace_ops_test() in function_graph_enter() causes a NULL
pointer dereference BUG because fgraph_ops->ops.func_hash is NULL.

This can be reproduced by the below commands because function profiler's
fgraph_ops does not initialize the hash list;

 # cd /sys/kernel/tracing
 # echo function_graph > current_tracer
 # echo 1 > function_profile_enabled

To fix this problem, add a new fgraph_ops to fgraph_array after
ftrace_startup_subops(). Thus, until the new fgraph_ops is initialized,
we will see fgraph_stub on the corresponding fgraph_array entry.

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Florent Revest <revest@chromium.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: bpf <bpf@vger.kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Guo Ren <guoren@kernel.org>
Link: https://lore.kernel.org/172398528350.293426.8347220120333730248.stgit@devnote2
Fixes: c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d1d5ea2d0a1b7..d7d4fb403f6f0 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1206,18 +1206,24 @@ static void init_task_vars(int idx)
 	read_unlock(&tasklist_lock);
 }
 
-static void ftrace_graph_enable_direct(bool enable_branch)
+static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops)
 {
 	trace_func_graph_ent_t func = NULL;
 	trace_func_graph_ret_t retfunc = NULL;
 	int i;
 
-	for_each_set_bit(i, &fgraph_array_bitmask,
-			 sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
-		func = fgraph_array[i]->entryfunc;
-		retfunc = fgraph_array[i]->retfunc;
-		fgraph_direct_gops = fgraph_array[i];
-	 }
+	if (gops) {
+		func = gops->entryfunc;
+		retfunc = gops->retfunc;
+		fgraph_direct_gops = gops;
+	} else {
+		for_each_set_bit(i, &fgraph_array_bitmask,
+				 sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+			func = fgraph_array[i]->entryfunc;
+			retfunc = fgraph_array[i]->retfunc;
+			fgraph_direct_gops = fgraph_array[i];
+		}
+	}
 	if (WARN_ON_ONCE(!func))
 		return;
 
@@ -1256,8 +1262,6 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 		ret = -ENOSPC;
 		goto out;
 	}
-
-	fgraph_array[i] = gops;
 	gops->idx = i;
 
 	ftrace_graph_active++;
@@ -1266,7 +1270,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 		ftrace_graph_disable_direct(true);
 
 	if (ftrace_graph_active == 1) {
-		ftrace_graph_enable_direct(false);
+		ftrace_graph_enable_direct(false, gops);
 		register_pm_notifier(&ftrace_suspend_notifier);
 		ret = start_graph_tracing();
 		if (ret)
@@ -1281,14 +1285,15 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 	} else {
 		init_task_vars(gops->idx);
 	}
-
 	/* Always save the function, and reset at unregistering */
 	gops->saved_func = gops->entryfunc;
 
 	ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
+	if (!ret)
+		fgraph_array[i] = gops;
+
 error:
 	if (ret) {
-		fgraph_array[i] = &fgraph_stub;
 		ftrace_graph_active--;
 		gops->saved_func = NULL;
 		fgraph_lru_release_index(i);
@@ -1324,7 +1329,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
 	ftrace_shutdown_subops(&graph_ops, &gops->ops, command);
 
 	if (ftrace_graph_active == 1)
-		ftrace_graph_enable_direct(true);
+		ftrace_graph_enable_direct(true, NULL);
 	else if (!ftrace_graph_active)
 		ftrace_graph_disable_direct(false);
 

From bc754cc76d1bbc87be5d8b7eee05ceb0ae613bce Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 20 Aug 2024 09:56:38 +0900
Subject: [PATCH 2/7] tracing: Fix memory leak in fgraph storage selftest

With ftrace boot-time selftest, kmemleak reported some memory leaks in
the new test case for function graph storage for multiple tracers.

unreferenced object 0xffff888005060080 (size 32):
  comm "swapper/0", pid 1, jiffies 4294676440
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 20 10 06 05 80 88 ff ff  ........ .......
    54 0c 1e 81 ff ff ff ff 00 00 00 00 00 00 00 00  T...............
  backtrace (crc 7c93416c):
    [<000000000238ee6f>] __kmalloc_cache_noprof+0x11f/0x2a0
    [<0000000033d2b6c5>] enter_record+0xe8/0x150
    [<0000000054c38424>] match_records+0x1cd/0x230
    [<00000000c775b63d>] ftrace_set_hash+0xff/0x380
    [<000000007bf7208c>] ftrace_set_filter+0x70/0x90
    [<00000000a5c08dda>] test_graph_storage_multi+0x2e/0xf0
    [<000000006ba028ca>] trace_selftest_startup_function_graph+0x1e8/0x260
    [<00000000a715d3eb>] run_tracer_selftest+0x111/0x190
    [<00000000395cbf90>] register_tracer+0xdf/0x1f0
    [<0000000093e67f7b>] do_one_initcall+0x141/0x3b0
    [<00000000c591b682>] do_initcall_level+0x82/0xa0
    [<000000004e4c6600>] do_initcalls+0x43/0x70
    [<0000000034f3c4e4>] kernel_init_freeable+0x170/0x1f0
    [<00000000c7a5dab2>] kernel_init+0x1a/0x1a0
    [<00000000ea105947>] ret_from_fork+0x3a/0x50
    [<00000000a1932e84>] ret_from_fork_asm+0x1a/0x30
...

This means filter hash allocated for the fixtures are not correctly
released after the test.

Free those hash lists after tests are done and split the loop for
initialize fixture and register fixture for rollback.

Fixes: dd120af2d5f8 ("ftrace: Add multiple fgraph storage selftest")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/172411539857.28895.13119957560263401102.stgit@devnote2
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_selftest.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 97f1e4bc47dc8..c4ad7cd7e7780 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -942,7 +942,7 @@ static __init int test_graph_storage_multi(void)
 {
 	struct fgraph_fixture *fixture;
 	bool printed = false;
-	int i, ret;
+	int i, j, ret;
 
 	pr_cont("PASSED\n");
 	pr_info("Testing multiple fgraph storage on a function: ");
@@ -953,22 +953,35 @@ static __init int test_graph_storage_multi(void)
 		if (ret && ret != -ENODEV) {
 			pr_cont("*Could not set filter* ");
 			printed = true;
-			goto out;
+			goto out2;
 		}
+	}
 
+	for (j = 0; j < ARRAY_SIZE(store_bytes); j++) {
+		fixture = &store_bytes[j];
 		ret = register_ftrace_graph(&fixture->gops);
 		if (ret) {
 			pr_warn("Failed to init store_bytes fgraph tracing\n");
 			printed = true;
-			goto out;
+			goto out1;
 		}
 	}
 
 	DYN_FTRACE_TEST_NAME();
-out:
+out1:
+	while (--j >= 0) {
+		fixture = &store_bytes[j];
+		unregister_ftrace_graph(&fixture->gops);
+
+		if (fixture->error_str && !printed) {
+			pr_cont("*** %s ***", fixture->error_str);
+			printed = true;
+		}
+	}
+out2:
 	while (--i >= 0) {
 		fixture = &store_bytes[i];
-		unregister_ftrace_graph(&fixture->gops);
+		ftrace_free_filter(&fixture->gops.ops);
 
 		if (fixture->error_str && !printed) {
 			pr_cont("*** %s ***", fixture->error_str);

From 49aa8a1f4d6800721c7971ed383078257f12e8f9 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian@huaweicloud.com>
Date: Tue, 27 Aug 2024 20:46:54 +0800
Subject: [PATCH 3/7] tracing: Avoid possible softlockup in
 tracing_iter_reset()

In __tracing_open(), when max latency tracers took place on the cpu,
the time start of its buffer would be updated, then event entries with
timestamps being earlier than start of the buffer would be skipped
(see tracing_iter_reset()).

Softlockup will occur if the kernel is non-preemptible and too many
entries were skipped in the loop that reset every cpu buffer, so add
cond_resched() to avoid it.

Cc: stable@vger.kernel.org
Fixes: 2f26ebd549b9a ("tracing: use timestamp to determine start of latency traces")
Link: https://lore.kernel.org/20240827124654.3817443-1-zhengyejian@huaweicloud.com
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian@huaweicloud.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ebe7ce2f5f4a5..edf6bc817aa12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3958,6 +3958,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 			break;
 		entries++;
 		ring_buffer_iter_advance(buf_iter);
+		/* This could be a big loop */
+		cond_resched();
 	}
 
 	per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;

From d2603279c7d645bf0d11fa253b23f1ab48fc8d3c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 4 Sep 2024 13:16:05 -0400
Subject: [PATCH 4/7] eventfs: Use list_del_rcu() for SRCU protected list
 variable

Chi Zhiling reported:

  We found a null pointer accessing in tracefs[1], the reason is that the
  variable 'ei_child' is set to LIST_POISON1, that means the list was
  removed in eventfs_remove_rec. so when access the ei_child->is_freed, the
  panic triggered.

  by the way, the following script can reproduce this panic

  loop1 (){
      while true
      do
          echo "p:kp submit_bio" > /sys/kernel/debug/tracing/kprobe_events
          echo "" > /sys/kernel/debug/tracing/kprobe_events
      done
  }
  loop2 (){
      while true
      do
          tree /sys/kernel/debug/tracing/events/kprobes/
      done
  }
  loop1 &
  loop2

  [1]:
  [ 1147.959632][T17331] Unable to handle kernel paging request at virtual address dead000000000150
  [ 1147.968239][T17331] Mem abort info:
  [ 1147.971739][T17331]   ESR = 0x0000000096000004
  [ 1147.976172][T17331]   EC = 0x25: DABT (current EL), IL = 32 bits
  [ 1147.982171][T17331]   SET = 0, FnV = 0
  [ 1147.985906][T17331]   EA = 0, S1PTW = 0
  [ 1147.989734][T17331]   FSC = 0x04: level 0 translation fault
  [ 1147.995292][T17331] Data abort info:
  [ 1147.998858][T17331]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
  [ 1148.005023][T17331]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
  [ 1148.010759][T17331]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
  [ 1148.016752][T17331] [dead000000000150] address between user and kernel address ranges
  [ 1148.024571][T17331] Internal error: Oops: 0000000096000004 [#1] SMP
  [ 1148.030825][T17331] Modules linked in: team_mode_loadbalance team nlmon act_gact cls_flower sch_ingress bonding tls macvlan dummy ib_core bridge stp llc veth amdgpu amdxcp mfd_core gpu_sched drm_exec drm_buddy radeon crct10dif_ce video drm_suballoc_helper ghash_ce drm_ttm_helper sha2_ce ttm sha256_arm64 i2c_algo_bit sha1_ce sbsa_gwdt cp210x drm_display_helper cec sr_mod cdrom drm_kms_helper binfmt_misc sg loop fuse drm dm_mod nfnetlink ip_tables autofs4 [last unloaded: tls]
  [ 1148.072808][T17331] CPU: 3 PID: 17331 Comm: ls Tainted: G        W         ------- ----  6.6.43 #2
  [ 1148.081751][T17331] Source Version: 21b3b386e948bedd29369af66f3e98ab01b1c650
  [ 1148.088783][T17331] Hardware name: Greatwall GW-001M1A-FTF/GW-001M1A-FTF, BIOS KunLun BIOS V4.0 07/16/2020
  [ 1148.098419][T17331] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
  [ 1148.106060][T17331] pc : eventfs_iterate+0x2c0/0x398
  [ 1148.111017][T17331] lr : eventfs_iterate+0x2fc/0x398
  [ 1148.115969][T17331] sp : ffff80008d56bbd0
  [ 1148.119964][T17331] x29: ffff80008d56bbf0 x28: ffff001ff5be2600 x27: 0000000000000000
  [ 1148.127781][T17331] x26: ffff001ff52ca4e0 x25: 0000000000009977 x24: dead000000000100
  [ 1148.135598][T17331] x23: 0000000000000000 x22: 000000000000000b x21: ffff800082645f10
  [ 1148.143415][T17331] x20: ffff001fddf87c70 x19: ffff80008d56bc90 x18: 0000000000000000
  [ 1148.151231][T17331] x17: 0000000000000000 x16: 0000000000000000 x15: ffff001ff52ca4e0
  [ 1148.159048][T17331] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
  [ 1148.166864][T17331] x11: 0000000000000000 x10: 0000000000000000 x9 : ffff8000804391d0
  [ 1148.174680][T17331] x8 : 0000000180000000 x7 : 0000000000000018 x6 : 0000aaab04b92862
  [ 1148.182498][T17331] x5 : 0000aaab04b92862 x4 : 0000000080000000 x3 : 0000000000000068
  [ 1148.190314][T17331] x2 : 000000000000000f x1 : 0000000000007ea8 x0 : 0000000000000001
  [ 1148.198131][T17331] Call trace:
  [ 1148.201259][T17331]  eventfs_iterate+0x2c0/0x398
  [ 1148.205864][T17331]  iterate_dir+0x98/0x188
  [ 1148.210036][T17331]  __arm64_sys_getdents64+0x78/0x160
  [ 1148.215161][T17331]  invoke_syscall+0x78/0x108
  [ 1148.219593][T17331]  el0_svc_common.constprop.0+0x48/0xf0
  [ 1148.224977][T17331]  do_el0_svc+0x24/0x38
  [ 1148.228974][T17331]  el0_svc+0x40/0x168
  [ 1148.232798][T17331]  el0t_64_sync_handler+0x120/0x130
  [ 1148.237836][T17331]  el0t_64_sync+0x1a4/0x1a8
  [ 1148.242182][T17331] Code: 54ffff6c f9400676 910006d6 f9000676 (b9405300)
  [ 1148.248955][T17331] ---[ end trace 0000000000000000 ]---

The issue is that list_del() is used on an SRCU protected list variable
before the synchronization occurs. This can poison the list pointers while
there is a reader iterating the list.

This is simply fixed by using list_del_rcu() that is specifically made for
this purpose.

Link: https://lore.kernel.org/linux-trace-kernel/20240829085025.3600021-1-chizhiling@163.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20240904131605.640d42b1@gandalf.local.home
Fixes: 43aa6f97c2d03 ("eventfs: Get rid of dentry pointers without refcounts")
Reported-by: Chi Zhiling <chizhiling@kylinos.cn>
Tested-by: Chi Zhiling <chizhiling@kylinos.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 01e99e98457dd..8705c77a9e75a 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -862,7 +862,7 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
 	list_for_each_entry(ei_child, &ei->children, list)
 		eventfs_remove_rec(ei_child, level + 1);
 
-	list_del(&ei->list);
+	list_del_rcu(&ei->list);
 	free_ei(ei);
 }
 

From 177e1cc2f41235c145041eed03ef5bab18f32328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 4 Sep 2024 10:34:28 -0400
Subject: [PATCH 5/7] tracing/osnoise: Use a cpumask to know what threads are
 kthreads

The start_kthread() and stop_thread() code was not always called with the
interface_lock held. This means that the kthread variable could be
unexpectedly changed causing the kthread_stop() to be called on it when it
should not have been, leading to:

 while true; do
   rtla timerlat top -u -q & PID=$!;
   sleep 5;
   kill -INT $PID;
   sleep 0.001;
   kill -TERM $PID;
   wait $PID;
  done

Causing the following OOPS:

 Oops: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN PTI
 KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
 CPU: 5 UID: 0 PID: 885 Comm: timerlatu/5 Not tainted 6.11.0-rc4-test-00002-gbc754cc76d1b-dirty #125 a533010b71dab205ad2f507188ce8c82203b0254
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
 RIP: 0010:hrtimer_active+0x58/0x300
 Code: 48 c1 ee 03 41 54 48 01 d1 48 01 d6 55 53 48 83 ec 20 80 39 00 0f 85 30 02 00 00 49 8b 6f 30 4c 8d 75 10 4c 89 f0 48 c1 e8 03 <0f> b6 3c 10 4c 89 f0 83 e0 07 83 c0 03 40 38 f8 7c 09 40 84 ff 0f
 RSP: 0018:ffff88811d97f940 EFLAGS: 00010202
 RAX: 0000000000000002 RBX: ffff88823c6b5b28 RCX: ffffed10478d6b6b
 RDX: dffffc0000000000 RSI: ffffed10478d6b6c RDI: ffff88823c6b5b28
 RBP: 0000000000000000 R08: ffff88823c6b5b58 R09: ffff88823c6b5b60
 R10: ffff88811d97f957 R11: 0000000000000010 R12: 00000000000a801d
 R13: ffff88810d8b35d8 R14: 0000000000000010 R15: ffff88823c6b5b28
 FS:  0000000000000000(0000) GS:ffff88823c680000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000561858ad7258 CR3: 000000007729e001 CR4: 0000000000170ef0
 Call Trace:
  <TASK>
  ? die_addr+0x40/0xa0
  ? exc_general_protection+0x154/0x230
  ? asm_exc_general_protection+0x26/0x30
  ? hrtimer_active+0x58/0x300
  ? __pfx_mutex_lock+0x10/0x10
  ? __pfx_locks_remove_file+0x10/0x10
  hrtimer_cancel+0x15/0x40
  timerlat_fd_release+0x8e/0x1f0
  ? security_file_release+0x43/0x80
  __fput+0x372/0xb10
  task_work_run+0x11e/0x1f0
  ? _raw_spin_lock+0x85/0xe0
  ? __pfx_task_work_run+0x10/0x10
  ? poison_slab_object+0x109/0x170
  ? do_exit+0x7a0/0x24b0
  do_exit+0x7bd/0x24b0
  ? __pfx_migrate_enable+0x10/0x10
  ? __pfx_do_exit+0x10/0x10
  ? __pfx_read_tsc+0x10/0x10
  ? ktime_get+0x64/0x140
  ? _raw_spin_lock_irq+0x86/0xe0
  do_group_exit+0xb0/0x220
  get_signal+0x17ba/0x1b50
  ? vfs_read+0x179/0xa40
  ? timerlat_fd_read+0x30b/0x9d0
  ? __pfx_get_signal+0x10/0x10
  ? __pfx_timerlat_fd_read+0x10/0x10
  arch_do_signal_or_restart+0x8c/0x570
  ? __pfx_arch_do_signal_or_restart+0x10/0x10
  ? vfs_read+0x179/0xa40
  ? ksys_read+0xfe/0x1d0
  ? __pfx_ksys_read+0x10/0x10
  syscall_exit_to_user_mode+0xbc/0x130
  do_syscall_64+0x74/0x110
  ? __pfx___rseq_handle_notify_resume+0x10/0x10
  ? __pfx_ksys_read+0x10/0x10
  ? fpregs_restore_userregs+0xdb/0x1e0
  ? fpregs_restore_userregs+0xdb/0x1e0
  ? syscall_exit_to_user_mode+0x116/0x130
  ? do_syscall_64+0x74/0x110
  ? do_syscall_64+0x74/0x110
  ? do_syscall_64+0x74/0x110
  entry_SYSCALL_64_after_hwframe+0x71/0x79
 RIP: 0033:0x7ff0070eca9c
 Code: Unable to access opcode bytes at 0x7ff0070eca72.
 RSP: 002b:00007ff006dff8c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
 RAX: 0000000000000000 RBX: 0000000000000005 RCX: 00007ff0070eca9c
 RDX: 0000000000000400 RSI: 00007ff006dff9a0 RDI: 0000000000000003
 RBP: 00007ff006dffde0 R08: 0000000000000000 R09: 00007ff000000ba0
 R10: 00007ff007004b08 R11: 0000000000000246 R12: 0000000000000003
 R13: 00007ff006dff9a0 R14: 0000000000000007 R15: 0000000000000008
  </TASK>
 Modules linked in: snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec snd_hwdep snd_hda_core
 ---[ end trace 0000000000000000 ]---

This is because it would mistakenly call kthread_stop() on a user space
thread making it "exit" before it actually exits.

Since kthreads are created based on global behavior, use a cpumask to know
when kthreads are running and that they need to be shutdown before
proceeding to do new work.

Link: https://lore.kernel.org/all/20240820130001.124768-1-tglozar@redhat.com/

This was debugged by using the persistent ring buffer:

Link: https://lore.kernel.org/all/20240823013902.135036960@goodmis.org/

Note, locking was originally used to fix this, but that proved to cause too
many deadlocks to work around:

  https://lore.kernel.org/linux-trace-kernel/20240823102816.5e55753b@gandalf.local.home/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>
Link: https://lore.kernel.org/20240904103428.08efdf4c@gandalf.local.home
Fixes: e88ed227f639e ("tracing/timerlat: Add user-space interface")
Reported-by: Tomas Glozar <tglozar@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 66a871553d4a1..d770927efcd9a 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1612,6 +1612,7 @@ static int run_osnoise(void)
 
 static struct cpumask osnoise_cpumask;
 static struct cpumask save_cpumask;
+static struct cpumask kthread_cpumask;
 
 /*
  * osnoise_sleep - sleep until the next period
@@ -1675,6 +1676,7 @@ static inline int osnoise_migration_pending(void)
 	 */
 	mutex_lock(&interface_lock);
 	this_cpu_osn_var()->kthread = NULL;
+	cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask);
 	mutex_unlock(&interface_lock);
 
 	return 1;
@@ -1947,9 +1949,10 @@ static void stop_kthread(unsigned int cpu)
 
 	kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
 	if (kthread) {
-		if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+		if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
+		    !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
 			kthread_stop(kthread);
-		} else {
+		} else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) {
 			/*
 			 * This is a user thread waiting on the timerlat_fd. We need
 			 * to close all users, and the best way to guarantee this is
@@ -2021,6 +2024,7 @@ static int start_kthread(unsigned int cpu)
 	}
 
 	per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+	cpumask_set_cpu(cpu, &kthread_cpumask);
 
 	return 0;
 }
@@ -2048,8 +2052,16 @@ static int start_per_cpu_kthreads(void)
 	 */
 	cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
 
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) {
+			struct task_struct *kthread;
+
+			kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+			if (!WARN_ON(!kthread))
+				kthread_stop(kthread);
+		}
 		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+	}
 
 	for_each_cpu(cpu, current_mask) {
 		retval = start_kthread(cpu);

From e6a53481da292d970d1edf0d8831121d1c5e2f0d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 5 Sep 2024 08:53:30 -0400
Subject: [PATCH 6/7] tracing/timerlat: Only clear timer if a kthread exists

The timerlat tracer can use user space threads to check for osnoise and
timer latency. If the program using this is killed via a SIGTERM, the
threads are shutdown one at a time and another tracing instance can start
up resetting the threads before they are fully closed. That causes the
hrtimer assigned to the kthread to be shutdown and freed twice when the
dying thread finally closes the file descriptors, causing a use-after-free
bug.

Only cancel the hrtimer if the associated thread is still around. Also add
the interface_lock around the resetting of the tlat_var->kthread.

Note, this is just a quick fix that can be backported to stable. A real
fix is to have a better synchronization between the shutdown of old
threads and the starting of new ones.

Link: https://lore.kernel.org/all/20240820130001.124768-1-tglozar@redhat.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>
Link: https://lore.kernel.org/20240905085330.45985730@gandalf.local.home
Fixes: e88ed227f639e ("tracing/timerlat: Add user-space interface")
Reported-by: Tomas Glozar <tglozar@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index d770927efcd9a..48e5014dd4ab7 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -252,6 +252,11 @@ static inline struct timerlat_variables *this_cpu_tmr_var(void)
 	return this_cpu_ptr(&per_cpu_timerlat_var);
 }
 
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
 /*
  * tlat_var_reset - Reset the values of the given timerlat_variables
  */
@@ -259,14 +264,20 @@ static inline void tlat_var_reset(void)
 {
 	struct timerlat_variables *tlat_var;
 	int cpu;
+
+	/* Synchronize with the timerlat interfaces */
+	mutex_lock(&interface_lock);
 	/*
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
 	for_each_cpu(cpu, cpu_online_mask) {
 		tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+		if (tlat_var->kthread)
+			hrtimer_cancel(&tlat_var->timer);
 		memset(tlat_var, 0, sizeof(*tlat_var));
 	}
+	mutex_unlock(&interface_lock);
 }
 #else /* CONFIG_TIMERLAT_TRACER */
 #define tlat_var_reset()	do {} while (0)
@@ -331,11 +342,6 @@ struct timerlat_sample {
 };
 #endif
 
-/*
- * Protect the interface.
- */
-static struct mutex interface_lock;
-
 /*
  * Tracer data.
  */
@@ -2591,7 +2597,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file)
 	osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
 	tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
 
-	hrtimer_cancel(&tlat_var->timer);
+	if (tlat_var->kthread)
+		hrtimer_cancel(&tlat_var->timer);
 	memset(tlat_var, 0, sizeof(*tlat_var));
 
 	osn_var->sampling = 0;

From 5bfbcd1ee57b607fd29e4645c7f350dd385dd9ad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 5 Sep 2024 11:33:59 -0400
Subject: [PATCH 7/7] tracing/timerlat: Add interface_lock around clearing of
 kthread in stop_kthread()

The timerlat interface will get and put the task that is part of the
"kthread" field of the osn_var to keep it around until all references are
released. But here's a race in the "stop_kthread()" code that will call
put_task_struct() on the kthread if it is not a kernel thread. This can
race with the releasing of the references to that task struct and the
put_task_struct() can be called twice when it should have been called just
once.

Take the interface_lock() in stop_kthread() to synchronize this change.
But to do so, the function stop_per_cpu_kthreads() needs to change the
loop from for_each_online_cpu() to for_each_possible_cpu() and remove the
cpu_read_lock(), as the interface_lock can not be taken while the cpu
locks are held. The only side effect of this change is that it may do some
extra work, as the per_cpu variables of the offline CPUs would not be set
anyway, and would simply be skipped in the loop.

Remove unneeded "return;" in stop_kthread().

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Tomas Glozar <tglozar@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>
Link: https://lore.kernel.org/20240905113359.2b934242@gandalf.local.home
Fixes: e88ed227f639e ("tracing/timerlat: Add user-space interface")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 48e5014dd4ab7..bbe47781617ef 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1953,8 +1953,12 @@ static void stop_kthread(unsigned int cpu)
 {
 	struct task_struct *kthread;
 
+	mutex_lock(&interface_lock);
 	kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
 	if (kthread) {
+		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+		mutex_unlock(&interface_lock);
+
 		if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
 		    !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
 			kthread_stop(kthread);
@@ -1967,8 +1971,8 @@ static void stop_kthread(unsigned int cpu)
 			kill_pid(kthread->thread_pid, SIGKILL, 1);
 			put_task_struct(kthread);
 		}
-		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
 	} else {
+		mutex_unlock(&interface_lock);
 		/* if no workload, just return */
 		if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
 			/*
@@ -1976,7 +1980,6 @@ static void stop_kthread(unsigned int cpu)
 			 */
 			per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
 			barrier();
-			return;
 		}
 	}
 }
@@ -1991,12 +1994,8 @@ static void stop_per_cpu_kthreads(void)
 {
 	int cpu;
 
-	cpus_read_lock();
-
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		stop_kthread(cpu);
-
-	cpus_read_unlock();
 }
 
 /*