Skip to content

Commit

Permalink
Merge tag 'perf-core-for-mingo-4.16-20180110' of git://git.kernel.org…
Browse files Browse the repository at this point in the history
…/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

- The 'perf test bpf' entry hooked a eBPF proggie to the
  SyS_epoll_wait() kernel function and expected it to be hit when calling
  the epoll_wait() libc wrapper, which changed recently, in systems such
  as Fedora 27, with the glibc wrapper calling instead the epoll_pwait()
  syscall, so switch to epoll_pwait() for both the kernel and libc
  function, getting it to work both in old and new systems (Arnaldo Carvalho de Melo)

- Beautify 'gettid' syscall result in 'perf trace', and in doing so
  noticed that we need to handle namespaces in 'perf trace', will be
  dealt with in follow up patches where we'll try to figure out if
  the recent support for namespace in tools/perf/ can be used for this
  purpose as well. (Arnaldo Carvalho de Melo)

- Introduce 'perf report --mmaps' and 'perf report --tasks' to show
  info present in 'perf.data' (Jiri Olsa, Arnaldo Carvalho de Melo)

- Synchronize kernel <-> tooling headers wrt meltdown/spectre changes
  (Arnaldo Carvalho de Melo)

- Fix a wrong offset issue when using /proc/kcore (Jin Yao)

- Fix bug that prevented annotating symbols in perf.data files
  generated with 'perf record --branch-any'  (Jin Yao)

- Add infrastructure to record first and last sample time to the
  perf.data file header, so that when processing all samples in
  a 'perf record' session, such as when doing build-id processing,
  or when specifically requesting that that info be recorded, use
  that in 'perf report --time', that also got support for percent
  slices in addition to absolute ones.

  I.e. now it is possible to ask for the samples in the 10%-20%
  time slice of a perf.data file (Jin Yao)

- Enable building with libbabeltrace by default (Jiri Olsa)

- Display perf_event_attr::namespaces when duping the attributes
  in verbose mode (Jiri Olsa)

- Allocate context task_ctx_data for child event (Jiri Olsa)

- Update comments for PERF_RECORD_ITRACE_START and PERF_RECORD_MISC_* (Jiri Olsa)

- Add support for showing PERF_RECORD_LOST events in 'perf script' (Jiri Olsa)

- Add 'perf report --stats' option to display quick statistics about
  metadata events (PERF_RECORD_*) i.e. what we get at the end of 'perf
  report -D' (Jiri Olsa)

- Fix compile error with libunwind x86 (Wang Nan)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Ingo Molnar committed Jan 11, 2018
2 parents 9128d3e + 5d64db2 commit 1ccb8fe
Show file tree
Hide file tree
Showing 36 changed files with 884 additions and 140 deletions.
10 changes: 7 additions & 3 deletions include/uapi/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,9 +612,12 @@ struct perf_event_mmap_page {
*/
#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12)
/*
* PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
* different events so can reuse the same bit position.
* Ditto PERF_RECORD_MISC_SWITCH_OUT.
* Following PERF_RECORD_MISC_* are used on different
* events, so can reuse the same bit position:
*
* PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events
* PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event
* PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
*/
#define PERF_RECORD_MISC_MMAP_DATA (1 << 13)
#define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
Expand Down Expand Up @@ -864,6 +867,7 @@ enum perf_event_type {
* struct perf_event_header header;
* u32 pid;
* u32 tid;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_ITRACE_START = 12,
Expand Down
15 changes: 0 additions & 15 deletions kernel/events/callchain.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,21 +178,6 @@ put_callchain_entry(int rctx)
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}

struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;

if (!kernel && !user)
return NULL;

return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
}

struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
Expand Down
54 changes: 39 additions & 15 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -5815,19 +5815,11 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_read(handle, event);

if (sample_type & PERF_SAMPLE_CALLCHAIN) {
if (data->callchain) {
int size = 1;

if (data->callchain)
size += data->callchain->nr;

size *= sizeof(u64);
int size = 1;

__output_copy(handle, data->callchain, size);
} else {
u64 nr = 0;
perf_output_put(handle, nr);
}
size += data->callchain->nr;
size *= sizeof(u64);
__output_copy(handle, data->callchain, size);
}

if (sample_type & PERF_SAMPLE_RAW) {
Expand Down Expand Up @@ -5980,6 +5972,26 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}

static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

static struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;

if (!kernel && !user)
return &__empty_callchain;

callchain = get_perf_callchain(regs, 0, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}

void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
Expand All @@ -6002,9 +6014,7 @@ void perf_prepare_sample(struct perf_event_header *header,
int size = 1;

data->callchain = perf_callchain(event, regs);

if (data->callchain)
size += data->callchain->nr;
size += data->callchain->nr;

header->size += size * sizeof(u64);
}
Expand Down Expand Up @@ -10703,6 +10713,19 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;


if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
!child_ctx->task_ctx_data) {
struct pmu *pmu = child_event->pmu;

child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
GFP_KERNEL);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
return NULL;
}
}

/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
* must be under the same lock in order to serialize against
Expand All @@ -10713,6 +10736,7 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
/* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
Expand Down
4 changes: 0 additions & 4 deletions kernel/events/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)

DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)

/* Callchain handling */
extern struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs);

static inline int get_recursion_context(int *recursion)
{
int rctx;
Expand Down
4 changes: 3 additions & 1 deletion tools/arch/x86/include/asm/cpufeatures.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,12 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */

#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */

#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
Expand Down Expand Up @@ -340,5 +341,6 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */

#endif /* _ASM_X86_CPUFEATURES_H */
8 changes: 7 additions & 1 deletion tools/arch/x86/include/asm/disabled-features.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif

#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define DISABLE_PTI 0
#else
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif

/*
* Make sure to add features to the correct mask
*/
Expand All @@ -60,7 +66,7 @@
#define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 0
#define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
#define DISABLED_MASK10 0
Expand Down
10 changes: 7 additions & 3 deletions tools/include/uapi/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,9 +612,12 @@ struct perf_event_mmap_page {
*/
#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12)
/*
* PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
* different events so can reuse the same bit position.
* Ditto PERF_RECORD_MISC_SWITCH_OUT.
* Following PERF_RECORD_MISC_* are used on different
* events, so can reuse the same bit position:
*
* PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events
* PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event
* PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
*/
#define PERF_RECORD_MISC_MMAP_DATA (1 << 13)
#define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
Expand Down Expand Up @@ -864,6 +867,7 @@ enum perf_event_type {
* struct perf_event_header header;
* u32 pid;
* u32 tid;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_ITRACE_START = 12,
Expand Down
3 changes: 3 additions & 0 deletions tools/perf/Documentation/perf-record.txt
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,9 @@ Configure all used events to run in user space.
--timestamp-filename
Append timestamp to output file name.

--timestamp-boundary::
Record timestamp boundary (time of first/last samples).

--switch-output[=mode]::
Generate multiple perf.data files, timestamp prefixed, switching to a new one
based on 'mode' value:
Expand Down
37 changes: 36 additions & 1 deletion tools/perf/Documentation/perf-report.txt
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,26 @@ OPTIONS
stop time is not given (i.e, time string is 'x.y,') then analysis goes
to end of file.

Also support time percent with multiple time range. Time string is
'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'. The maximum number of slices is 10.

For example:
Select the second 10% time slice:

perf report --time 10%/2

Select from 0% to 10% time slice:

perf report --time 0%-10%

Select the first and second 10% time slices:

perf report --time 10%/1,10%/2

Select from 0% to 10% and 30% to 40% slices:

perf report --time 0%-10%,30%-40%

--itrace::
Options for decoding instruction tracing data. The options are:

Expand Down Expand Up @@ -437,8 +457,23 @@ include::itrace.txt[]
will be printed. Each entry is function name or file/line. Enabled by
default, disable with --no-inline.

--mmaps::
Show --tasks output plus mmap information in a format similar to
/proc/<PID>/maps.

Please note that not all mmaps are stored, options affecting which ones
are include 'perf record --data', for instance.

--stats::
Display overall events statistics without any further processing.
(like the one at the end of the perf report -D command)

--tasks::
Display monitored tasks stored in perf data. Displaying pid/tid/ppid
plus the command string aligned to distinguish parent and child tasks.

include::callchain-overhead-calculation.txt[]

SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-annotate[1]
linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1]
39 changes: 38 additions & 1 deletion tools/perf/Documentation/perf-script.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ OPTIONS
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
brstackoff, callindent, insn, insnlen, synth, phys_addr, metric.
brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
Expand Down Expand Up @@ -225,6 +225,24 @@ OPTIONS
that the metric computed is averaged over the whole sampling
period, not just for the sample point.

For sample events it's possible to display misc field with -F +misc option,
following letters are displayed for each bit:

PERF_RECORD_MISC_KERNEL K
PERF_RECORD_MISC_USER U
PERF_RECORD_MISC_HYPERVISOR H
PERF_RECORD_MISC_GUEST_KERNEL G
PERF_RECORD_MISC_GUEST_USER g
PERF_RECORD_MISC_MMAP_DATA* M
PERF_RECORD_MISC_COMM_EXEC E
PERF_RECORD_MISC_SWITCH_OUT S

$ perf script -F +misc ...
sched-messaging 1414 K 28690.636582: 4590 cycles ...
sched-messaging 1407 U 28690.636600: 325620 cycles ...
sched-messaging 1414 K 28690.636608: 19473 cycles ...
misc field ___________/

-k::
--vmlinux=<file>::
vmlinux pathname
Expand Down Expand Up @@ -282,6 +300,9 @@ OPTIONS
Display context switch events i.e. events of type PERF_RECORD_SWITCH or
PERF_RECORD_SWITCH_CPU_WIDE.

--show-lost-events
Display lost events i.e. events of type PERF_RECORD_LOST.

--demangle::
Demangle symbol names to human readable form. It's enabled by default,
disable with --no-demangle.
Expand Down Expand Up @@ -329,6 +350,22 @@ include::itrace.txt[]
stop time is not given (i.e, time string is 'x.y,') then analysis goes
to end of file.

Also support time percent with multipe time range. Time string is
'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'. The maximum number of slices is 10.

For example:
Select the second 10% time slice
perf script --time 10%/2

Select from 0% to 10% time slice
perf script --time 0%-10%

Select the first and second 10% time slices
perf script --time 10%/1,10%/2

Select from 0% to 10% and 30% to 40% slices
perf script --time 0%-10%,30%-40%

--max-blocks::
Set the maximum number of program blocks to print with brstackasm for
each sample.
Expand Down
4 changes: 4 additions & 0 deletions tools/perf/Documentation/perf.data-file-format.txt
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,10 @@ struct {
struct perf_header_string map;
}[number_of_cache_levels];

HEADER_SAMPLE_TIME = 21,

Two uint64_t for the time of first sample and the time of last sample.

other bits are reserved and should ignored for now
HEADER_FEAT_BITS = 256,

Expand Down
2 changes: 1 addition & 1 deletion tools/perf/Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ else
NO_PERF_READ_VDSOX32 := 1
endif

ifdef LIBBABELTRACE
ifndef NO_LIBBABELTRACE
$(call feature_check,libbabeltrace)
ifeq ($(feature-libbabeltrace), 1)
CFLAGS += -DHAVE_LIBBABELTRACE_SUPPORT $(LIBBABELTRACE_CFLAGS)
Expand Down
2 changes: 1 addition & 1 deletion tools/perf/Makefile.perf
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ include ../scripts/utilities.mak
#
# Define NO_ZLIB if you do not want to support compressed kernel modules
#
# Define LIBBABELTRACE if you DO want libbabeltrace support
# Define NO_LIBBABELTRACE if you do not want libbabeltrace support
# for CTF data format.
#
# Define NO_LZMA if you do not want to support compressed (xz) kernel modules
Expand Down
2 changes: 1 addition & 1 deletion tools/perf/arch/x86/util/unwind-libunwind.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0

#ifndef REMOTE_UNWIND_LIBUNWIND
#include <errno.h>
#ifndef REMOTE_UNWIND_LIBUNWIND
#include <libunwind.h>
#include "perf_regs.h"
#include "../../util/unwind.h"
Expand Down
Loading

0 comments on commit 1ccb8fe

Please sign in to comment.