perf: attach/detach PMU specific data
The LBR call stack data has to be saved/restored during context switch
to fix the shortened LBR call stack issue in system-wide mode. Allocate
PMU-specific data and attach it to the corresponding task_struct while
LBR call stack monitoring is active.

When an LBR call stack event is accounted, the perf_ctx_data for the
related tasks is allocated/attached by attach_perf_ctx_data().
When an LBR call stack event is unaccounted, the perf_ctx_data for the
related tasks is detached/freed by detach_perf_ctx_data().
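
For orientation, here is a minimal sketch of the object being attached,
inferred from the fields the new code dereferences (rcu_head, refcount,
global, ctx_cache, data); the authoritative definition lives in
include/linux/perf_event.h, added earlier in this series, and may differ
in detail:

struct perf_ctx_data {
        struct rcu_head         rcu_head;       /* deferred free via call_rcu() */
        refcount_t              refcount;       /* events pinning this data */
        int                     global;         /* attached for system-wide use */
        struct kmem_cache       *ctx_cache;     /* cache the payload was allocated from */
        void                    *data;          /* PMU-specific payload, e.g. the LBR stack */
};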

An LBR call stack event can be either a per-task event or a system-wide
event; a sketch of how a PMU opts into this path follows the list below.
- For a per-task event, perf only allocates the perf_ctx_data for the
  current task. If the allocation fails, perf will error out.
- For a system-wide event, perf has to allocate the perf_ctx_data for
  both the existing tasks and any upcoming tasks.
  The allocation for the existing tasks is done in perf_event_alloc().
  If any allocation fails, perf will error out.
  The allocation for new tasks is done in perf_event_fork().
  A global reader/writer semaphore, global_ctx_data_rwsem, is added to
  address the global race.
- The perf_ctx_data is only freed by the last LBR call stack event.
  The number of per-task events is tracked by the refcount of each
  task's perf_ctx_data. Since system-wide events impact all tasks, it is
  not practical to go through the whole task list and update the
  refcount for each system-wide event. The number of system-wide events
  is therefore tracked by the global variable global_ctx_data_ref.
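
The driver side is not part of this patch. As a hedged illustration
only, a PMU that wants per-task data would provide a task_ctx_cache and
tag qualifying events with PERF_ATTACH_TASK_DATA, roughly as sketched
below; example_pmu, struct example_lbr_data, and wants_lbr_callstack()
are hypothetical names:

/* Hypothetical driver-side opt-in; not part of this patch. */
struct example_lbr_data {
        u64     lbr_from[32];
        u64     lbr_to[32];
};

static int example_pmu_hw_config(struct perf_event *event)
{
        /* LBR call stack mode needs per-task save/restore on context switch */
        if (wants_lbr_callstack(event))
                event->attach_state |= PERF_ATTACH_TASK_DATA;
        return 0;
}

static int __init example_pmu_setup(void)
{
        /* attach_task_ctx_data() allocates the per-task payload from this cache */
        example_pmu.task_ctx_cache =
                kmem_cache_create("example_lbr_data",
                                  sizeof(struct example_lbr_data), 0, 0, NULL);
        return example_pmu.task_ctx_cache ? 0 : -ENOMEM;
}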

Suggested-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250314172700.438923-3-kan.liang@linux.intel.com
Kan Liang authored and Peter Zijlstra committed Mar 17, 2025
1 parent fdfda86 commit 506e64e
Showing 2 changed files with 291 additions and 1 deletion.
3 changes: 2 additions & 1 deletion include/linux/perf_event.h
@@ -676,11 +676,12 @@ struct swevent_hlist {
#define PERF_ATTACH_GROUP 0x0002
#define PERF_ATTACH_TASK 0x0004
#define PERF_ATTACH_TASK_DATA 0x0008
-#define PERF_ATTACH_ITRACE 0x0010
+#define PERF_ATTACH_GLOBAL_DATA 0x0010
#define PERF_ATTACH_SCHED_CB 0x0020
#define PERF_ATTACH_CHILD 0x0040
#define PERF_ATTACH_EXCLUSIVE 0x0080
#define PERF_ATTACH_CALLCHAIN 0x0100
+#define PERF_ATTACH_ITRACE 0x0200

struct bpf_prog;
struct perf_cgroup;
289 changes: 289 additions & 0 deletions kernel/events/core.c
@@ -55,6 +55,7 @@
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>

#include "internal.h"

@@ -5217,6 +5218,225 @@ static void unaccount_freq_event(void)
        atomic_dec(&nr_freq_events);
}


static struct perf_ctx_data *
alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
{
        struct perf_ctx_data *cd;

        cd = kzalloc(sizeof(*cd), GFP_KERNEL);
        if (!cd)
                return NULL;

        cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
        if (!cd->data) {
                kfree(cd);
                return NULL;
        }

        cd->global = global;
        cd->ctx_cache = ctx_cache;
        refcount_set(&cd->refcount, 1);

        return cd;
}

static void free_perf_ctx_data(struct perf_ctx_data *cd)
{
        kmem_cache_free(cd->ctx_cache, cd->data);
        kfree(cd);
}

static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
{
        struct perf_ctx_data *cd;

        cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
        free_perf_ctx_data(cd);
}

static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
{
        call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
}

static int
attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
                     bool global)
{
        struct perf_ctx_data *cd, *old = NULL;

        cd = alloc_perf_ctx_data(ctx_cache, global);
        if (!cd)
                return -ENOMEM;

        for (;;) {
                if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
                        if (old)
                                perf_free_ctx_data_rcu(old);
                        return 0;
                }

                if (!old) {
                        /*
                         * After seeing a dead @old, we raced with
                         * removal and lost, try again to install @cd.
                         */
                        continue;
                }

                if (refcount_inc_not_zero(&old->refcount)) {
                        free_perf_ctx_data(cd); /* unused */
                        return 0;
                }

                /*
                 * @old is a dead object, refcount==0 is stable, try and
                 * replace it with @cd.
                 */
        }
        return 0;
}

static void __detach_global_ctx_data(void);
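/*
 * global_ctx_data_rwsem serializes system-wide attach/detach against
 * the fork and exit paths; global_ctx_data_ref counts the system-wide
 * events that currently require per-task data.
 */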
DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
static refcount_t global_ctx_data_ref;

static int
attach_global_ctx_data(struct kmem_cache *ctx_cache)
{
        struct task_struct *g, *p;
        struct perf_ctx_data *cd;
        int ret;

        if (refcount_inc_not_zero(&global_ctx_data_ref))
                return 0;

        guard(percpu_write)(&global_ctx_data_rwsem);
        if (refcount_inc_not_zero(&global_ctx_data_ref))
                return 0;
again:
        /* Allocate everything */
        scoped_guard (rcu) {
                for_each_process_thread(g, p) {
                        cd = rcu_dereference(p->perf_ctx_data);
                        if (cd && !cd->global) {
                                cd->global = 1;
                                if (!refcount_inc_not_zero(&cd->refcount))
                                        cd = NULL;
                        }
                        if (!cd) {
                                get_task_struct(p);
                                goto alloc;
                        }
                }
        }

        refcount_set(&global_ctx_data_ref, 1);

        return 0;
alloc:
        ret = attach_task_ctx_data(p, ctx_cache, true);
        put_task_struct(p);
        if (ret) {
                __detach_global_ctx_data();
                return ret;
        }
        goto again;
}

static int
attach_perf_ctx_data(struct perf_event *event)
{
        struct task_struct *task = event->hw.target;
        struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
        int ret;

        if (!ctx_cache)
                return -ENOMEM;

        if (task)
                return attach_task_ctx_data(task, ctx_cache, false);

        ret = attach_global_ctx_data(ctx_cache);
        if (ret)
                return ret;

        event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
        return 0;
}

static void
detach_task_ctx_data(struct task_struct *p)
{
        struct perf_ctx_data *cd;

        scoped_guard (rcu) {
                cd = rcu_dereference(p->perf_ctx_data);
                if (!cd || !refcount_dec_and_test(&cd->refcount))
                        return;
        }

        /*
         * The old ctx_data may have been replaced in a race;
         * nothing needs to be done in that case.
         * See attach_task_ctx_data().
         */
        if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
                perf_free_ctx_data_rcu(cd);
}

static void __detach_global_ctx_data(void)
{
        struct task_struct *g, *p;
        struct perf_ctx_data *cd;

again:
        scoped_guard (rcu) {
                for_each_process_thread(g, p) {
                        cd = rcu_dereference(p->perf_ctx_data);
                        if (!cd || !cd->global)
                                continue;
                        cd->global = 0;
                        get_task_struct(p);
                        goto detach;
                }
        }
        return;
detach:
        detach_task_ctx_data(p);
        put_task_struct(p);
        goto again;
}

static void detach_global_ctx_data(void)
{
        if (refcount_dec_not_one(&global_ctx_data_ref))
                return;

        guard(percpu_write)(&global_ctx_data_rwsem);
        if (!refcount_dec_and_test(&global_ctx_data_ref))
                return;

        /* remove everything */
        __detach_global_ctx_data();
}

static void detach_perf_ctx_data(struct perf_event *event)
{
        struct task_struct *task = event->hw.target;

        event->attach_state &= ~PERF_ATTACH_TASK_DATA;

        if (task)
                return detach_task_ctx_data(task);

        if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
                detach_global_ctx_data();
                event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
        }
}

static void unaccount_event(struct perf_event *event)
{
        bool dec = false;
@@ -5398,6 +5618,9 @@ static void __free_event(struct perf_event *event)
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        if (event->attach_state & PERF_ATTACH_TASK_DATA)
                detach_perf_ctx_data(event);

        if (event->destroy)
                event->destroy(event);

@@ -8607,10 +8830,58 @@ static void perf_event_task(struct task_struct *task,
                        task_ctx);
}

/*
 * Allocate data for a new task when profiling system-wide
 * events which require PMU-specific data.
 */
static void
perf_event_alloc_task_data(struct task_struct *child,
                           struct task_struct *parent)
{
        struct kmem_cache *ctx_cache = NULL;
        struct perf_ctx_data *cd;

        if (!refcount_read(&global_ctx_data_ref))
                return;

        scoped_guard (rcu) {
                cd = rcu_dereference(parent->perf_ctx_data);
                if (cd)
                        ctx_cache = cd->ctx_cache;
        }

        if (!ctx_cache)
                return;

        guard(percpu_read)(&global_ctx_data_rwsem);
        scoped_guard (rcu) {
                cd = rcu_dereference(child->perf_ctx_data);
                if (!cd) {
                        /*
                         * A system-wide event may have been unaccounted
                         * by now; re-check before attaching the
                         * perf_ctx_data.
                         */
                        if (!refcount_read(&global_ctx_data_ref))
                                return;
                        goto attach;
                }

                if (!cd->global) {
                        cd->global = 1;
                        refcount_inc(&cd->refcount);
                }
        }

        return;
attach:
        attach_task_ctx_data(child, ctx_cache, true);
}

void perf_event_fork(struct task_struct *task)
{
        perf_event_task(task, NULL, 1);
        perf_event_namespaces(task);
        perf_event_alloc_task_data(task, current);
}

/*
@@ -12490,6 +12761,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (IS_ERR(pmu))
                return (void*)pmu;

        /*
         * PERF_ATTACH_TASK_DATA is set in event_init()->hw_config().
         * The attach has to happen right after perf_init_event();
         * otherwise, an error in between would make __free_event()
         * try to detach a perf_ctx_data that was never attached.
         */
        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
                err = attach_perf_ctx_data(event);
                if (err)
                        return ERR_PTR(err);
        }

        /*
         * Disallow uncore-task events. Similarly, disallow uncore-cgroup
         * events (they don't make sense as the cgroup will be different
@@ -13637,6 +13920,12 @@ void perf_event_exit_task(struct task_struct *child)
         * At this point we need to send EXIT events to cpu contexts.
         */
        perf_event_task(child, NULL, 0);

        /*
         * Detach the perf_ctx_data for the system-wide event.
         */
        guard(percpu_read)(&global_ctx_data_rwsem);
        detach_task_ctx_data(child);
}

static void perf_free_event(struct perf_event *event,
