perf: Optimize event scheduling locking
Currently we only hold one ctx->lock at a time, which results in us
flipping back and forth between cpuctx->ctx.lock and task_ctx->lock.

Avoid this and gain large atomic regions by holding both locks. We
nest the task lock inside the cpu lock, since with task scheduling we
might have to change task ctx while holding the cpu ctx lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110409192141.769881865@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Peter Zijlstra authored and Ingo Molnar committed May 28, 2011
1 parent 9137fb2 commit facc430
1 changed file: kernel/events/core.c (36 additions, 25 deletions)
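
The core of the change is the new perf_ctx_lock()/perf_ctx_unlock() pair added at the top of the diff: the per-CPU context lock is always taken first, and the task context lock, when present, is nested inside it, so the schedule-out/schedule-in work runs inside one large critical section instead of repeatedly taking and dropping each ctx->lock. Below is a minimal user-space sketch of that nesting pattern; the struct layout, pthread mutexes, and function names are illustrative stand-ins for the kernel's raw_spinlock-based code, not the actual perf API.

/*
 * Sketch of the lock-nesting pattern this commit introduces:
 * outer = per-CPU context lock, inner = optional per-task context lock.
 * Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

struct ctx {
	pthread_mutex_t lock;	/* stands in for the kernel's ctx->lock */
	int nr_events;
};

struct cpu_ctx {
	struct ctx ctx;		/* per-CPU context: its lock is taken first */
	struct ctx *task_ctx;	/* per-task context, may be NULL: nested inside */
};

static void ctx_lock_pair(struct cpu_ctx *cpuctx, struct ctx *ctx)
{
	pthread_mutex_lock(&cpuctx->ctx.lock);		/* cpu ctx lock first */
	if (ctx)
		pthread_mutex_lock(&ctx->lock);		/* task ctx lock nested inside */
}

static void ctx_unlock_pair(struct cpu_ctx *cpuctx, struct ctx *ctx)
{
	if (ctx)
		pthread_mutex_unlock(&ctx->lock);	/* release the inner lock first */
	pthread_mutex_unlock(&cpuctx->ctx.lock);
}

int main(void)
{
	struct ctx task_ctx = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr_events = 1 };
	struct cpu_ctx cpuctx = {
		.ctx = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr_events = 2 },
		.task_ctx = &task_ctx,
	};

	/* One atomic region covering both contexts, the way the patch brackets
	 * its sched-out/sched-in sequences. */
	ctx_lock_pair(&cpuctx, cpuctx.task_ctx);
	printf("handling %d cpu + %d task events under both locks\n",
	       cpuctx.ctx.nr_events, cpuctx.task_ctx->nr_events);
	ctx_unlock_pair(&cpuctx, cpuctx.task_ctx);
	return 0;
}

In the patch itself this pairing brackets the sched-out/sched-in work in perf_cgroup_switch(), perf_event_context_sched_in() and perf_rotate_context(), which is why the per-function raw_spin_lock(&ctx->lock)/raw_spin_unlock(&ctx->lock) calls can be dropped from ctx_sched_out(), ctx_sched_in(), perf_ctx_adjust_freq() and rotate_ctx().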
@@ -200,6 +200,22 @@ __get_cpu_context(struct perf_event_context *ctx)
return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
raw_spin_lock(&cpuctx->ctx.lock);
if (ctx)
raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
raw_spin_unlock(&ctx->lock);
raw_spin_unlock(&cpuctx->ctx.lock);
}

#ifdef CONFIG_CGROUP_PERF

/*
@@ -340,11 +356,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
rcu_read_lock();

list_for_each_entry_rcu(pmu, &pmus, entry) {

cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

perf_pmu_disable(cpuctx->ctx.pmu);

/*
* perf_cgroup_events says at least one
* context on this CPU has cgroup events.
@@ -353,6 +366,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
* events for a context.
*/
if (cpuctx->ctx.nr_cgroups > 0) {
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);

if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +387,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

perf_pmu_enable(cpuctx->ctx.pmu);
}

rcu_read_unlock();
@@ -1759,15 +1774,14 @@ static void ctx_sched_out(struct perf_event_context *ctx,
{
struct perf_event *event;

raw_spin_lock(&ctx->lock);
ctx->is_active = 0;
if (likely(!ctx->nr_events))
goto out;
return;

update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);

if (!ctx->nr_active)
goto out;
return;

perf_pmu_disable(ctx->pmu);
if (event_type & EVENT_PINNED) {
@@ -1780,8 +1794,6 @@
group_sched_out(event, cpuctx, ctx);
}
perf_pmu_enable(ctx->pmu);
out:
raw_spin_unlock(&ctx->lock);
}

/*
@@ -1929,8 +1941,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
rcu_read_unlock();

if (do_switch) {
raw_spin_lock(&ctx->lock);
ctx_sched_out(ctx, cpuctx, EVENT_ALL);
cpuctx->task_ctx = NULL;
raw_spin_unlock(&ctx->lock);
}
}

@@ -2056,10 +2070,9 @@
{
u64 now;

raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
if (likely(!ctx->nr_events))
goto out;
return;

now = perf_clock();
ctx->timestamp = now;
@@ -2074,9 +2087,6 @@
/* Then walk through the lower prio flexible groups */
if (event_type & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, cpuctx);

out:
raw_spin_unlock(&ctx->lock);
}

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2110,6 +2120,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (cpuctx->task_ctx == ctx)
return;

perf_ctx_lock(cpuctx, ctx);
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -2124,12 +2135,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,

cpuctx->task_ctx = ctx;

perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);

/*
* Since these rotations are per-cpu, we need to ensure the
* cpu-context we got scheduled on is actually rotating.
*/
perf_pmu_rotate_start(ctx->pmu);
perf_pmu_enable(ctx->pmu);
}

/*
@@ -2269,7 +2282,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
u64 interrupts, now;
s64 delta;

raw_spin_lock(&ctx->lock);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -2301,24 +2313,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (delta > 0)
perf_adjust_period(event, period, delta);
}
raw_spin_unlock(&ctx->lock);
}

/*
* Round-robin a context's events:
*/
static void rotate_ctx(struct perf_event_context *ctx)
{
raw_spin_lock(&ctx->lock);

/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
if (!ctx->rotate_disable)
list_rotate_left(&ctx->flexible_groups);

raw_spin_unlock(&ctx->lock);
}

/*
@@ -2345,6 +2352,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
rotate = 1;
}

perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
perf_ctx_adjust_freq(&cpuctx->ctx, interval);
if (ctx)
@@ -2370,6 +2378,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
list_del_init(&cpuctx->rotation_list);

perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

void perf_event_task_tick(void)
@@ -2424,9 +2433,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
* in.
*/
perf_cgroup_sched_out(current);
task_ctx_sched_out(ctx, EVENT_ALL);

raw_spin_lock(&ctx->lock);
task_ctx_sched_out(ctx, EVENT_ALL);

list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
ret = event_enable_on_exec(event, ctx);
@@ -5982,6 +5991,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
}

static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;

int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
@@ -6032,6 +6042,7 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
@@ -6776,14 +6787,14 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* our context.
*/
child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
task_ctx_sched_out(child_ctx, EVENT_ALL);

/*
* Take the context lock here so that if find_get_context is
* reading child->perf_event_ctxp, we wait until it has
* incremented the context's refcount before we do put_ctx below.
*/
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx, EVENT_ALL);
child->perf_event_ctxp[ctxn] = NULL;
/*
* If this context is a clone; unclone it so it can't get