perf record: Enable off-cpu analysis with BPF
Add --off-cpu option to enable off-cpu profiling with BPF.  It uses a
bpf_output event renamed to "offcpu-time".  Samples will be
synthesized at the end of the record session using data from a BPF
map which contains the aggregated off-cpu time at context switches.
Because of the BPF usage, off-cpu profiling requires root privileges.
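
The BPF program itself (off_cpu.bpf.c, one of the changed files but not
shown in this excerpt) boils down to something like the sketch below.
This is illustrative only, assuming a libbpf CO-RE build with vmlinux.h;
map sizes, task-state handling and helper usage are simplified and do
not reproduce the actual program:

  // Illustrative sketch only -- not the off_cpu.bpf.c added by this commit.
  // At each context switch, remember when (and with what user stack) the
  // outgoing task blocked; when it runs again, add the elapsed time into a
  // hash map keyed like the off_cpu_key struct read back by bpf_off_cpu.c.
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  #define MAX_STACKS 32

  struct off_cpu_key {
          __u32 pid;
          __u32 tgid;
          __u32 stack_id;
          __u32 state;
  };

  struct sched_in {
          __u64 timestamp;
          __u32 stack_id;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_STACK_TRACE);
          __uint(key_size, sizeof(__u32));
          __uint(value_size, MAX_STACKS * sizeof(__u64));
          __uint(max_entries, 1024);
  } stacks SEC(".maps");

  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 10240);
          __type(key, __u32);              /* pid */
          __type(value, struct sched_in);
  } tstamp SEC(".maps");

  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 10240);
          __type(key, struct off_cpu_key);
          __type(value, __u64);            /* off-cpu time in nsec */
  } off_cpu SEC(".maps");

  int enabled = 0;

  SEC("tp_btf/sched_switch")
  int on_switch(u64 *ctx)
  {
          struct task_struct *prev = (struct task_struct *)ctx[1];
          struct task_struct *next = (struct task_struct *)ctx[2];
          __u64 now = bpf_ktime_get_ns();
          __u64 *total, zero = 0;
          struct sched_in elem, *pelem;
          struct off_cpu_key key;
          __u32 pid;

          if (!enabled)
                  return 0;

          /* outgoing task: record when and where (user stack) it blocked */
          pid = prev->pid;
          elem.timestamp = now;
          elem.stack_id = bpf_get_stackid(ctx, &stacks,
                                          BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
          bpf_map_update_elem(&tstamp, &pid, &elem, BPF_ANY);

          /* incoming task: account the time it spent off-cpu */
          pid = next->pid;
          pelem = bpf_map_lookup_elem(&tstamp, &pid);
          if (!pelem)
                  return 0;

          key.pid = next->pid;
          key.tgid = next->tgid;
          key.stack_id = pelem->stack_id;
          key.state = 0;   /* task state handling omitted in this sketch */

          total = bpf_map_lookup_elem(&off_cpu, &key);
          if (!total) {
                  bpf_map_update_elem(&off_cpu, &key, &zero, BPF_NOEXIST);
                  total = bpf_map_lookup_elem(&off_cpu, &key);
                  if (!total)
                          return 0;
          }
          *total += now - pelem->timestamp;

          return 0;
  }

  char LICENSE[] SEC("license") = "Dual BSD/GPL";

The user-space side then walks the off_cpu and stacks maps and writes
synthesized samples into the data file (see off_cpu_write() below).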

Each sample will have a separate user stacktrace, so kernel threads
(which have no user stack) are skipped.  The sample ip will be set
from the stacktrace and other sample data will be updated
accordingly.  Currently it only handles some basic sample types.
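
For reference, with the sample types handled so far, each synthesized
record laid out by off_cpu_write() looks roughly like this (one u64
slot per field):

  { header } { identifier } { ip } { tid } { time } { id } { cpu }
  { period } { nr, PERF_CONTEXT_USER, ip0, ip1, ... }   <- callchain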

The sample timestamp is set to a dummy value so that it doesn't
interfere with other events during sorting.  It starts from a very
large initial value and is incremented for each processed sample.
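
Concretely, in bpf_off_cpu.c below:

  /* we don't need actual timestamp, just want to put the samples at last */
  #define OFF_CPU_TIMESTAMP (~0ull << 32)
  ...
  /* increase dummy timestamp to sort later samples */
  tstamp++;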

A nice property is that it can be used together with regular
profiling like cpu cycles.  If you don't want that, you can use a
dummy event to enable off-cpu profiling only, as shown below.
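
For example, something like this should record the off-cpu data only
(a usage sketch; pick whatever workload you like):

  $ sudo perf record -e dummy --off-cpu -- perf bench sched messaging -l 1000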

Example output:
  $ sudo perf record --off-cpu perf bench sched messaging -l 1000

  $ sudo perf report --stdio --call-graph=no
  # Total Lost Samples: 0
  #
  # Samples: 41K of event 'cycles'
  # Event count (approx.): 42137343851
  ...

  # Samples: 1K of event 'offcpu-time'
  # Event count (approx.): 587990831640
  #
  # Children      Self  Command          Shared Object       Symbol
  # ........  ........  ...............  ..................  .........................
  #
      81.66%     0.00%  sched-messaging  libc-2.33.so        [.] __libc_start_main
      81.66%     0.00%  sched-messaging  perf                [.] cmd_bench
      81.66%     0.00%  sched-messaging  perf                [.] main
      81.66%     0.00%  sched-messaging  perf                [.] run_builtin
      81.43%     0.00%  sched-messaging  perf                [.] bench_sched_messaging
      40.86%    40.86%  sched-messaging  libpthread-2.33.so  [.] __read
      37.66%    37.66%  sched-messaging  libpthread-2.33.so  [.] __write
       2.91%     2.91%  sched-messaging  libc-2.33.so        [.] __poll
  ...

As you can see, it spent most of the off-cpu time in __read() and
__write() called from bench_sched_messaging().  The --call-graph=no
option was added just to keep the output concise here.

It uses the perf hooks facility to control the BPF program during the
record session rather than adding new BPF/off-cpu specific calls.
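
Concretely, off_cpu_prepare() below registers the record start/end
hooks that flip the 'enabled' switch in the BPF program:

  perf_hooks__set_hook("record_start", off_cpu_start, NULL);
  perf_hooks__set_hook("record_end", off_cpu_finish, NULL);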

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-3-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Namhyung Kim authored and Arnaldo Carvalho de Melo committed May 26, 2022
1 parent 303ead4 commit edc41a1
Showing 7 changed files with 404 additions and 0 deletions.
10 changes: 10 additions & 0 deletions tools/perf/Documentation/perf-record.txt
@@ -758,6 +758,16 @@ include::intel-hybrid.txt[]
        If the URLs is not specified, the value of DEBUGINFOD_URLS
        system environment variable is used.

--off-cpu::
        Enable off-cpu profiling with BPF. The BPF program will collect
        task scheduling information with (user) stacktrace and save them
        as sample data of a software event named "offcpu-time". The
        sample period will have the time the task slept in nanoseconds.

        Note that BPF can collect stack traces using frame pointer ("fp")
        only, as of now. So the applications built without the frame
        pointer might see bogus addresses.

SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
1 change: 1 addition & 0 deletions tools/perf/Makefile.perf
@@ -1038,6 +1038,7 @@ SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
SKELETONS += $(SKEL_OUT)/off_cpu.skel.h

$(SKEL_TMP_OUT) $(LIBBPF_OUTPUT):
        $(Q)$(MKDIR) -p $@
25 changes: 25 additions & 0 deletions tools/perf/builtin-record.c
@@ -49,6 +49,7 @@
#include "util/clockid.h"
#include "util/pmu-hybrid.h"
#include "util/evlist-hybrid.h"
#include "util/off_cpu.h"
#include "asm/bug.h"
#include "perf.h"
#include "cputopo.h"
@@ -162,6 +163,7 @@ struct record {
        bool buildid_mmap;
        bool timestamp_filename;
        bool timestamp_boundary;
        bool off_cpu;
        struct switch_output switch_output;
        unsigned long long samples;
        unsigned long output_max_size; /* = 0: unlimited */
@@ -888,6 +890,11 @@ static int record__config_text_poke(struct evlist *evlist)
        return 0;
}

static int record__config_off_cpu(struct record *rec)
{
        return off_cpu_prepare(rec->evlist);
}

static bool record__kcore_readable(struct machine *machine)
{
        char kcore[PATH_MAX];
@@ -2591,6 +2598,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
        } else
                status = err;

        if (rec->off_cpu)
                rec->bytes_written += off_cpu_write(rec->session);

        record__synthesize(rec, true);
        /* this will be recalculated during process_buildids() */
        rec->samples = 0;
@@ -3315,6 +3325,7 @@ static struct option __record_options[] = {
        OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
                            "write collected trace data into several data files using parallel threads",
                            record__parse_threads),
        OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
        OPT_END()
};

@@ -3734,6 +3745,12 @@ int cmd_record(int argc, const char **argv)
        set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

#ifndef HAVE_BPF_SKEL
# define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
        set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
# undef set_nobuild
#endif

        rec->opts.affinity = PERF_AFFINITY_SYS;
@@ -3972,6 +3989,14 @@ int cmd_record(int argc, const char **argv)
                }
        }

        if (rec->off_cpu) {
                err = record__config_off_cpu(rec);
                if (err) {
                        pr_err("record__config_off_cpu failed, error %d\n", err);
                        goto out;
                }
        }

        if (record_opts__config(&rec->opts)) {
                err = -EINVAL;
                goto out;
1 change: 1 addition & 0 deletions tools/perf/util/Build
@@ -147,6 +147,7 @@ perf-$(CONFIG_LIBBPF) += bpf_map.o
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
perf-$(CONFIG_LIBELF) += symbol-elf.o
perf-$(CONFIG_LIBELF) += probe-file.o
204 changes: 204 additions & 0 deletions tools/perf/util/bpf_off_cpu.c
@@ -0,0 +1,204 @@
// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/session.h"
#include <bpf/bpf.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS 32
/* we don't need actual timestamp, just want to put the samples at last */
#define OFF_CPU_TIMESTAMP (~0ull << 32)

static struct off_cpu_bpf *skel;

struct off_cpu_key {
        u32 pid;
        u32 tgid;
        u32 stack_id;
        u32 state;
};

union off_cpu_data {
        struct perf_event_header hdr;
        u64 array[1024 / sizeof(u64)];
};

static int off_cpu_config(struct evlist *evlist)
{
        struct evsel *evsel;
        struct perf_event_attr attr = {
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_BPF_OUTPUT,
                .size = sizeof(attr), /* to capture ABI version */
        };
        char *evname = strdup(OFFCPU_EVENT);

        if (evname == NULL)
                return -ENOMEM;

        evsel = evsel__new(&attr);
        if (!evsel) {
                free(evname);
                return -ENOMEM;
        }

        evsel->core.attr.freq = 1;
        evsel->core.attr.sample_period = 1;
        /* off-cpu analysis depends on stack trace */
        evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;

        evlist__add(evlist, evsel);

        free(evsel->name);
        evsel->name = evname;

        return 0;
}

static void off_cpu_start(void *arg __maybe_unused)
{
        skel->bss->enabled = 1;
}

static void off_cpu_finish(void *arg __maybe_unused)
{
        skel->bss->enabled = 0;
        off_cpu_bpf__destroy(skel);
}

int off_cpu_prepare(struct evlist *evlist)
{
        int err;

        if (off_cpu_config(evlist) < 0) {
                pr_err("Failed to config off-cpu BPF event\n");
                return -1;
        }

        set_max_rlimit();

        skel = off_cpu_bpf__open_and_load();
        if (!skel) {
                pr_err("Failed to open off-cpu BPF skeleton\n");
                return -1;
        }

        err = off_cpu_bpf__attach(skel);
        if (err) {
                pr_err("Failed to attach off-cpu BPF skeleton\n");
                goto out;
        }

        if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
            perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
                pr_err("Failed to attach off-cpu skeleton\n");
                goto out;
        }

        return 0;

out:
        off_cpu_bpf__destroy(skel);
        return -1;
}

int off_cpu_write(struct perf_session *session)
{
        int bytes = 0, size;
        int fd, stack;
        u64 sample_type, val, sid = 0;
        struct evsel *evsel;
        struct perf_data_file *file = &session->data->file;
        struct off_cpu_key prev, key;
        union off_cpu_data data = {
                .hdr = {
                        .type = PERF_RECORD_SAMPLE,
                        .misc = PERF_RECORD_MISC_USER,
                },
        };
        u64 tstamp = OFF_CPU_TIMESTAMP;

        skel->bss->enabled = 0;

        evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
        if (evsel == NULL) {
                pr_err("%s evsel not found\n", OFFCPU_EVENT);
                return 0;
        }

        sample_type = evsel->core.attr.sample_type;

        if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
                if (evsel->core.id)
                        sid = evsel->core.id[0];
        }

        fd = bpf_map__fd(skel->maps.off_cpu);
        stack = bpf_map__fd(skel->maps.stacks);
        memset(&prev, 0, sizeof(prev));

        while (!bpf_map_get_next_key(fd, &prev, &key)) {
                int n = 1; /* start from perf_event_header */
                int ip_pos = -1;

                bpf_map_lookup_elem(fd, &key, &val);

                if (sample_type & PERF_SAMPLE_IDENTIFIER)
                        data.array[n++] = sid;
                if (sample_type & PERF_SAMPLE_IP) {
                        ip_pos = n;
                        data.array[n++] = 0; /* will be updated */
                }
                if (sample_type & PERF_SAMPLE_TID)
                        data.array[n++] = (u64)key.pid << 32 | key.tgid;
                if (sample_type & PERF_SAMPLE_TIME)
                        data.array[n++] = tstamp;
                if (sample_type & PERF_SAMPLE_ID)
                        data.array[n++] = sid;
                if (sample_type & PERF_SAMPLE_CPU)
                        data.array[n++] = 0;
                if (sample_type & PERF_SAMPLE_PERIOD)
                        data.array[n++] = val;
                if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                        int len = 0;

                        /* data.array[n] is callchain->nr (updated later) */
                        data.array[n + 1] = PERF_CONTEXT_USER;
                        data.array[n + 2] = 0;

                        bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
                        while (data.array[n + 2 + len])
                                len++;

                        /* update length of callchain */
                        data.array[n] = len + 1;

                        /* update sample ip with the first callchain entry */
                        if (ip_pos >= 0)
                                data.array[ip_pos] = data.array[n + 2];

                        /* calculate sample callchain data array length */
                        n += len + 2;
                }
                /* TODO: handle more sample types */

                size = n * sizeof(u64);
                data.hdr.size = size;
                bytes += size;

                if (perf_data_file__write(file, &data, size) < 0) {
                        pr_err("failed to write perf data, error: %m\n");
                        return bytes;
                }

                prev = key;
                /* increase dummy timestamp to sort later samples */
                tstamp++;
        }
        return bytes;
}
