Skip to content

Commit

Permalink
Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux…
Browse files Browse the repository at this point in the history
…/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

New features:

  - Introduce PERF_RECORD_SWITCH(_CPU_WIDE) and use it in 'record' to
    ask for context switches, allowing non priviledged tasks to know
    when they are switched in and out, which wasn't possible with
    the other context switch tracepoint and software events, see the
    patch description for a comprehensive justification (Adrian Hunter)

  - Stop collecting /proc/kallsyms in perf.data files, saving about
    4.5MB on a typical x86-64 system, use the the symbol resolution
    routines used in all the other tools (report, top, etc) now that
    we can ask libtraceevent to use perf's symbol resolution code.
    (Arnaldo Carvalho de Melo)

User visible fixes:

  - Expose perf's symbol resolver to libtraceevent, so that its plugins can
    resolve tracepoint fields to kernel functions, like the 'function' field
    in the "timer:hrtimer_start tracepoint" (Arnaldo Carvalho de Melo)

Infrastructure changes:

  - Map propagation of thread and cpu maps improvements, prep work for
    'perf stat' new features (Jiri Olsa)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Ingo Molnar committed Jul 27, 2015
2 parents a11c51a + 7c14898 commit 4b0c53e
Show file tree
Hide file tree
Showing 33 changed files with 513 additions and 110 deletions.
31 changes: 30 additions & 1 deletion include/uapi/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,8 @@ struct perf_event_attr {
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
use_clockid : 1, /* use @clockid for time fields */
__reserved_1 : 38;
context_switch : 1, /* context switch data */
__reserved_1 : 37;

union {
__u32 wakeup_events; /* wakeup every n events */
Expand Down Expand Up @@ -572,9 +573,11 @@ struct perf_event_mmap_page {
/*
* PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
* different events so can reuse the same bit position.
* Ditto PERF_RECORD_MISC_SWITCH_OUT.
*/
#define PERF_RECORD_MISC_MMAP_DATA (1 << 13)
#define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13)
/*
* Indicates that the content of PERF_SAMPLE_IP points to
* the actual instruction that triggered the event. See also
Expand Down Expand Up @@ -818,6 +821,32 @@ enum perf_event_type {
*/
PERF_RECORD_LOST_SAMPLES = 13,

/*
* Records a context switch in or out (flagged by
* PERF_RECORD_MISC_SWITCH_OUT). See also
* PERF_RECORD_SWITCH_CPU_WIDE.
*
* struct {
* struct perf_event_header header;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_SWITCH = 14,

/*
* CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
* next_prev_tid that are the next (switching out) or previous
* (switching in) pid/tid.
*
* struct {
* struct perf_event_header header;
* u32 next_prev_pid;
* u32 next_prev_tid;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,

PERF_RECORD_MAX, /* non-ABI */
};

Expand Down
103 changes: 103 additions & 0 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
Expand Down Expand Up @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
local_irq_restore(flags);
}

static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);

#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

Expand All @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);

if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);

for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);

Expand Down Expand Up @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);

if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);

if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
Expand Down Expand Up @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
if (event->attr.context_switch) {
static_key_slow_dec_deferred(&perf_sched_events);
atomic_dec(&nr_switch_events);
}
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
Expand Down Expand Up @@ -5981,6 +5995,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
perf_output_end(&handle);
}

/*
* context_switch tracking
*/

struct perf_switch_event {
struct task_struct *task;
struct task_struct *next_prev;

struct {
struct perf_event_header header;
u32 next_prev_pid;
u32 next_prev_tid;
} event_id;
};

static int perf_event_switch_match(struct perf_event *event)
{
return event->attr.context_switch;
}

static void perf_event_switch_output(struct perf_event *event, void *data)
{
struct perf_switch_event *se = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int ret;

if (!perf_event_switch_match(event))
return;

/* Only CPU-wide events are allowed to see next/prev pid/tid */
if (event->ctx->task) {
se->event_id.header.type = PERF_RECORD_SWITCH;
se->event_id.header.size = sizeof(se->event_id.header);
} else {
se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
se->event_id.header.size = sizeof(se->event_id);
se->event_id.next_prev_pid =
perf_event_pid(event, se->next_prev);
se->event_id.next_prev_tid =
perf_event_tid(event, se->next_prev);
}

perf_event_header__init_id(&se->event_id.header, &sample, event);

ret = perf_output_begin(&handle, event, se->event_id.header.size);
if (ret)
return;

if (event->ctx->task)
perf_output_put(&handle, se->event_id.header);
else
perf_output_put(&handle, se->event_id);

perf_event__output_id_sample(event, &handle, &sample);

perf_output_end(&handle);
}

static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in)
{
struct perf_switch_event switch_event;

/* N.B. caller checks nr_switch_events != 0 */

switch_event = (struct perf_switch_event){
.task = task,
.next_prev = next_prev,
.event_id = {
.header = {
/* .type */
.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
/* .size */
},
/* .next_prev_pid */
/* .next_prev_tid */
},
};

perf_event_aux(perf_event_switch_output,
&switch_event,
NULL);
}

/*
* IRQ throttle logging
*/
Expand Down Expand Up @@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event)
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
if (event->attr.context_switch) {
atomic_inc(&nr_switch_events);
static_key_slow_inc(&perf_sched_events.key);
}
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
Expand Down
68 changes: 67 additions & 1 deletion tools/lib/traceevent/event-parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ static int func_map_init(struct pevent *pevent)
}

static struct func_map *
find_func(struct pevent *pevent, unsigned long long addr)
__find_func(struct pevent *pevent, unsigned long long addr)
{
struct func_map *func;
struct func_map key;
Expand All @@ -434,6 +434,71 @@ find_func(struct pevent *pevent, unsigned long long addr)
return func;
}

struct func_resolver {
pevent_func_resolver_t *func;
void *priv;
struct func_map map;
};

/**
* pevent_set_function_resolver - set an alternative function resolver
* @pevent: handle for the pevent
* @resolver: function to be used
* @priv: resolver function private state.
*
* Some tools may have already a way to resolve kernel functions, allow them to
* keep using it instead of duplicating all the entries inside
* pevent->funclist.
*/
int pevent_set_function_resolver(struct pevent *pevent,
pevent_func_resolver_t *func, void *priv)
{
struct func_resolver *resolver = malloc(sizeof(*resolver));

if (resolver == NULL)
return -1;

resolver->func = func;
resolver->priv = priv;

free(pevent->func_resolver);
pevent->func_resolver = resolver;

return 0;
}

/**
* pevent_reset_function_resolver - reset alternative function resolver
* @pevent: handle for the pevent
*
* Stop using whatever alternative resolver was set, use the default
* one instead.
*/
void pevent_reset_function_resolver(struct pevent *pevent)
{
free(pevent->func_resolver);
pevent->func_resolver = NULL;
}

static struct func_map *
find_func(struct pevent *pevent, unsigned long long addr)
{
struct func_map *map;

if (!pevent->func_resolver)
return __find_func(pevent, addr);

map = &pevent->func_resolver->map;
map->mod = NULL;
map->addr = addr;
map->func = pevent->func_resolver->func(pevent->func_resolver->priv,
&map->addr, &map->mod);
if (map->func == NULL)
return NULL;

return map;
}

/**
* pevent_find_function - find a function by a given address
* @pevent: handle for the pevent
Expand Down Expand Up @@ -6564,6 +6629,7 @@ void pevent_free(struct pevent *pevent)
free(pevent->trace_clock);
free(pevent->events);
free(pevent->sort_events);
free(pevent->func_resolver);

free(pevent);
}
Expand Down
8 changes: 8 additions & 0 deletions tools/lib/traceevent/event-parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,10 @@ struct cmdline_list;
struct func_map;
struct func_list;
struct event_handler;
struct func_resolver;

typedef char *(pevent_func_resolver_t)(void *priv,
unsigned long long *addrp, char **modp);

struct pevent {
int ref_count;
Expand Down Expand Up @@ -481,6 +485,7 @@ struct pevent {
int cmdline_count;

struct func_map *func_map;
struct func_resolver *func_resolver;
struct func_list *funclist;
unsigned int func_count;

Expand Down Expand Up @@ -611,6 +616,9 @@ enum trace_flag_type {
TRACE_FLAG_SOFTIRQ = 0x10,
};

int pevent_set_function_resolver(struct pevent *pevent,
pevent_func_resolver_t *func, void *priv);
void pevent_reset_function_resolver(struct pevent *pevent);
int pevent_register_comm(struct pevent *pevent, const char *comm, int pid);
int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock);
int pevent_register_function(struct pevent *pevent, char *name,
Expand Down
4 changes: 4 additions & 0 deletions tools/perf/Documentation/perf-record.txt
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,10 @@ When processing pre-existing threads /proc/XXX/mmap, it may take a long time,
because the file may be huge. A time out is needed in such cases.
This option sets the time out limit. The default value is 500 ms.

--switch-events::
Record context switch events i.e. events of type PERF_RECORD_SWITCH or
PERF_RECORD_SWITCH_CPU_WIDE.

SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
4 changes: 4 additions & 0 deletions tools/perf/Documentation/perf-script.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ OPTIONS
--show-mmap-events
Display mmap related events (e.g. MMAP, MMAP2).

--show-switch-events
Display context switch events i.e. events of type PERF_RECORD_SWITCH or
PERF_RECORD_SWITCH_CPU_WIDE.

--header
Show perf.data header.

Expand Down
1 change: 1 addition & 0 deletions tools/perf/builtin-inject.c
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
.lost = perf_event__repipe,
.aux = perf_event__repipe,
.itrace_start = perf_event__repipe,
.context_switch = perf_event__repipe,
.read = perf_event__repipe_sample,
.throttle = perf_event__repipe,
.unthrottle = perf_event__repipe,
Expand Down
7 changes: 7 additions & 0 deletions tools/perf/builtin-record.c
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,8 @@ struct option __record_options[] = {
"opts", "AUX area tracing Snapshot Mode", ""),
OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
"per thread proc mmap processing timeout in ms"),
OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
"Record context switch events"),
OPT_END()
};

Expand Down Expand Up @@ -1102,6 +1104,11 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
" system-wide mode\n");
usage_with_options(record_usage, record_options);
}
if (rec->opts.record_switch_events &&
!perf_can_record_switch_events()) {
ui__error("kernel does not support recording context switch events (--switch-events option)\n");
usage_with_options(record_usage, record_options);
}

if (!rec->itr) {
rec->itr = auxtrace_record__init(rec->evlist, &err);
Expand Down
Loading

0 comments on commit 4b0c53e

Please sign in to comment.