diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2a62b96600ad9..abaed4f8bb7f2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
 	_perf_event_disable(bp);
 
 	err = modify_user_hw_breakpoint_check(bp, attr, true);
-	if (err) {
-		if (!bp->attr.disabled)
-			_perf_event_enable(bp);
 
-		return err;
-	}
-
-	if (!attr->disabled)
+	if (!bp->attr.disabled)
 		_perf_event_enable(bp);
-	return 0;
+
+	return err;
 }
 
 static int perf_event_modify_attr(struct perf_event *event,
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b3814fce5ecb6..d6b56180827c7 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a
  */
 int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
 {
+	int err;
+
 	/*
 	 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
 	 * will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -520,15 +522,12 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	else
 		perf_event_disable(bp);
 
-	if (!attr->disabled) {
-		int err = modify_user_hw_breakpoint_check(bp, attr, false);
+	err = modify_user_hw_breakpoint_check(bp, attr, false);
 
-		if (err)
-			return err;
+	if (!bp->attr.disabled)
 		perf_event_enable(bp);
-		bp->attr.disabled = 0;
-	}
-	return 0;
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
 
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index ce1e20227c64d..70a42bec6931c 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -24,6 +24,7 @@
 #include <netinet/in.h>
 #include "event-parse.h"
 #include "event-utils.h"
+#include "trace-seq.h"
 
 static const char *input_buf;
 static unsigned long long input_buf_ptr;
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 44b7c2d41f9fc..fa665c66bfa4e 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -26,17 +26,12 @@
 #include <regex.h>
 #include <string.h>
 
+#include "trace-seq.h"
+
 #ifndef __maybe_unused
 #define __maybe_unused __attribute__((unused))
 #endif
 
-/* ----------------------- trace_seq ----------------------- */
-
-
-#ifndef TRACE_SEQ_BUF_SIZE
-#define TRACE_SEQ_BUF_SIZE 4096
-#endif
-
 #ifndef DEBUG_RECORD
 #define DEBUG_RECORD 0
 #endif
@@ -59,43 +54,6 @@ struct tep_record {
 #endif
 };
 
-enum trace_seq_fail {
-	TRACE_SEQ__GOOD,
-	TRACE_SEQ__BUFFER_POISONED,
-	TRACE_SEQ__MEM_ALLOC_FAILED,
-};
-
-/*
- * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
- */
-
-struct trace_seq {
-	char			*buffer;
-	unsigned int		buffer_size;
-	unsigned int		len;
-	unsigned int		readpos;
-	enum trace_seq_fail	state;
-};
-
-void trace_seq_init(struct trace_seq *s);
-void trace_seq_reset(struct trace_seq *s);
-void trace_seq_destroy(struct trace_seq *s);
-
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3)));
-extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
-	__attribute__ ((format (printf, 2, 0)));
-
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-
-extern void trace_seq_terminate(struct trace_seq *s);
-
-extern int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp);
-extern int trace_seq_do_printf(struct trace_seq *s);
-
-
 /* ----------------------- pevent ----------------------- */
 
 struct tep_handle;
diff --git a/tools/lib/traceevent/event-plugin.c b/tools/lib/traceevent/event-plugin.c
index f17e25097e1e2..ec16a103c0cce 100644
--- a/tools/lib/traceevent/event-plugin.c
+++ b/tools/lib/traceevent/event-plugin.c
@@ -15,6 +15,7 @@
 #include <dirent.h>
 #include "event-parse.h"
 #include "event-utils.h"
+#include "trace-seq.h"
 
 #define LOCAL_PLUGIN_DIR ".traceevent/plugins"
 
diff --git a/tools/lib/traceevent/plugin_function.c b/tools/lib/traceevent/plugin_function.c
index 424747475d37b..2919042e7dc20 100644
--- a/tools/lib/traceevent/plugin_function.c
+++ b/tools/lib/traceevent/plugin_function.c
@@ -23,6 +23,7 @@
 
 #include "event-parse.h"
 #include "event-utils.h"
+#include "trace-seq.h"
 
 static struct func_stack {
 	int size;
diff --git a/tools/lib/traceevent/plugin_hrtimer.c b/tools/lib/traceevent/plugin_hrtimer.c
index b43bfec565d83..29b608076ea05 100644
--- a/tools/lib/traceevent/plugin_hrtimer.c
+++ b/tools/lib/traceevent/plugin_hrtimer.c
@@ -23,6 +23,7 @@
 #include <string.h>
 
 #include "event-parse.h"
+#include "trace-seq.h"
 
 static int timer_expire_handler(struct trace_seq *s,
 				struct tep_record *record,
diff --git a/tools/lib/traceevent/plugin_jbd2.c b/tools/lib/traceevent/plugin_jbd2.c
index 45a9acd196409..a5e34135dd6a2 100644
--- a/tools/lib/traceevent/plugin_jbd2.c
+++ b/tools/lib/traceevent/plugin_jbd2.c
@@ -22,6 +22,7 @@
 #include <string.h>
 
 #include "event-parse.h"
+#include "trace-seq.h"
 
 #define MINORBITS	20
 #define MINORMASK	((1U << MINORBITS) - 1)
diff --git a/tools/lib/traceevent/plugin_kmem.c b/tools/lib/traceevent/plugin_kmem.c
index 73966b05abce3..a7a162575e2c9 100644
--- a/tools/lib/traceevent/plugin_kmem.c
+++ b/tools/lib/traceevent/plugin_kmem.c
@@ -22,6 +22,7 @@
 #include <string.h>
 
 #include "event-parse.h"
+#include "trace-seq.h"
 
 static int call_site_handler(struct trace_seq *s, struct tep_record *record,
 			     struct event_format *event, void *context)
diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugin_kvm.c
index 1d0d159062251..a0dfd3d0f1970 100644
--- a/tools/lib/traceevent/plugin_kvm.c
+++ b/tools/lib/traceevent/plugin_kvm.c
@@ -23,6 +23,7 @@
 #include <stdint.h>
 
 #include "event-parse.h"
+#include "trace-seq.h"
 
 #ifdef HAVE_UDIS86
 
diff --git a/tools/lib/traceevent/plugin_mac80211.c b/tools/lib/traceevent/plugin_mac80211.c
index de50a53162030..0b7779444b634 100644
--- a/tools/lib/traceevent/plugin_mac80211.c
+++ b/tools/lib/traceevent/plugin_mac80211.c
@@ -22,6 +22,7 @@
 #include <string.h>
 
 #include "event-parse.h"
+#include "trace-seq.h"
 
 #define INDENT 65
 
diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugin_sched_switch.c
index eecb4bd95c11b..582d3be2849b3 100644
--- a/tools/lib/traceevent/plugin_sched_switch.c
+++ b/tools/lib/traceevent/plugin_sched_switch.c
@@ -22,6 +22,7 @@
 #include <string.h>
 
 #include "event-parse.h"
+#include "trace-seq.h"
 
 static void write_state(struct trace_seq *s, int val)
 {
diff --git a/tools/lib/traceevent/plugin_scsi.c b/tools/lib/traceevent/plugin_scsi.c
index 5ec346f6b8425..4eba25cc14318 100644
--- a/tools/lib/traceevent/plugin_scsi.c
+++ b/tools/lib/traceevent/plugin_scsi.c
@@ -3,6 +3,7 @@
 #include <string.h>
 #include <inttypes.h>
 #include "event-parse.h"
+#include "trace-seq.h"
 
 typedef unsigned long sector_t;
 typedef uint64_t u64;
diff --git a/tools/lib/traceevent/plugin_xen.c b/tools/lib/traceevent/plugin_xen.c
index b2acbd6e9c86c..bc0496e4c296f 100644
--- a/tools/lib/traceevent/plugin_xen.c
+++ b/tools/lib/traceevent/plugin_xen.c
@@ -3,6 +3,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "event-parse.h"
+#include "trace-seq.h"
 
 #define __HYPERVISOR_set_trap_table			0
 #define __HYPERVISOR_mmu_update				1
diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c
index e3bac4543d3b7..8ff1d55954d15 100644
--- a/tools/lib/traceevent/trace-seq.c
+++ b/tools/lib/traceevent/trace-seq.c
@@ -3,6 +3,8 @@
  * Copyright (C) 2009 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
  *
  */
+#include "trace-seq.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/tools/lib/traceevent/trace-seq.h b/tools/lib/traceevent/trace-seq.h
new file mode 100644
index 0000000000000..d68ec69f8d1a8
--- /dev/null
+++ b/tools/lib/traceevent/trace-seq.h
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#ifndef _TRACE_SEQ_H
+#define _TRACE_SEQ_H
+
+#include <stdarg.h>
+#include <stdio.h>
+
+/* ----------------------- trace_seq ----------------------- */
+
+#ifndef TRACE_SEQ_BUF_SIZE
+#define TRACE_SEQ_BUF_SIZE 4096
+#endif
+
+enum trace_seq_fail {
+	TRACE_SEQ__GOOD,
+	TRACE_SEQ__BUFFER_POISONED,
+	TRACE_SEQ__MEM_ALLOC_FAILED,
+};
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE).
+ */
+
+struct trace_seq {
+	char			*buffer;
+	unsigned int		buffer_size;
+	unsigned int		len;
+	unsigned int		readpos;
+	enum trace_seq_fail	state;
+};
+
+void trace_seq_init(struct trace_seq *s);
+void trace_seq_reset(struct trace_seq *s);
+void trace_seq_destroy(struct trace_seq *s);
+
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+	__attribute__ ((format (printf, 2, 0)));
+
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+
+extern void trace_seq_terminate(struct trace_seq *s);
+
+extern int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp);
+extern int trace_seq_do_printf(struct trace_seq *s);
+
+#endif /* _TRACE_SEQ_H */
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index b3d1b12a5081b..92514fb3689f0 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -777,14 +777,14 @@ endif
 	$(call QUIET_INSTALL, libexec) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifndef NO_LIBBPF
-	$(call QUIET_INSTALL, lib) \
-		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'
-	$(call QUIET_INSTALL, include/bpf) \
-		$(INSTALL) include/bpf/*.h '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'
-	$(call QUIET_INSTALL, lib) \
-		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'
-	$(call QUIET_INSTALL, examples/bpf) \
-		$(INSTALL) examples/bpf/*.c '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'
+	$(call QUIET_INSTALL, bpf-headers) \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'; \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf/linux'; \
+		$(INSTALL) include/bpf/*.h -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'; \
+		$(INSTALL) include/bpf/linux/*.h -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf/linux'
+	$(call QUIET_INSTALL, bpf-examples) \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'; \
+		$(INSTALL) examples/bpf/*.c -t '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'
 endif
 	$(call QUIET_INSTALL, perf-archive) \
 		$(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
index f013b115dc860..dbef716a19135 100644
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -11,7 +11,8 @@ PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
 
 out    := $(OUTPUT)arch/arm64/include/generated/asm
 header := $(out)/syscalls.c
-sysdef := $(srctree)/tools/include/uapi/asm-generic/unistd.h
+incpath := $(srctree)/tools
+sysdef := $(srctree)/tools/arch/arm64/include/uapi/asm/unistd.h
 sysprf := $(srctree)/tools/perf/arch/arm64/entry/syscalls/
 systbl := $(sysprf)/mksyscalltbl
 
@@ -19,7 +20,7 @@ systbl := $(sysprf)/mksyscalltbl
 _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sysdef) $(systbl)
-	$(Q)$(SHELL) '$(systbl)' '$(CC)' '$(HOSTCC)' $(sysdef) > $@
+	$(Q)$(SHELL) '$(systbl)' '$(CC)' '$(HOSTCC)' $(incpath) $(sysdef) > $@
 
 clean::
 	$(call QUIET_CLEAN, arm64) $(RM) $(header)
diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c
index 6688977e4ac77..76c6345a57d5e 100644
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/arch/arm64/annotate/instructions.c
@@ -8,6 +8,63 @@ struct arm64_annotate {
 		jump_insn;
 };
 
+static int arm64_mov__parse(struct arch *arch __maybe_unused,
+			    struct ins_operands *ops,
+			    struct map_symbol *ms __maybe_unused)
+{
+	char *s = strchr(ops->raw, ','), *target, *endptr;
+
+	if (s == NULL)
+		return -1;
+
+	*s = '\0';
+	ops->source.raw = strdup(ops->raw);
+	*s = ',';
+
+	if (ops->source.raw == NULL)
+		return -1;
+
+	target = ++s;
+	ops->target.raw = strdup(target);
+	if (ops->target.raw == NULL)
+		goto out_free_source;
+
+	ops->target.addr = strtoull(target, &endptr, 16);
+	if (endptr == target)
+		goto out_free_target;
+
+	s = strchr(endptr, '<');
+	if (s == NULL)
+		goto out_free_target;
+	endptr = strchr(s + 1, '>');
+	if (endptr == NULL)
+		goto out_free_target;
+
+	*endptr = '\0';
+	*s = ' ';
+	ops->target.name = strdup(s);
+	*s = '<';
+	*endptr = '>';
+	if (ops->target.name == NULL)
+		goto out_free_target;
+
+	return 0;
+
+out_free_target:
+	zfree(&ops->target.raw);
+out_free_source:
+	zfree(&ops->source.raw);
+	return -1;
+}
+
+static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
+			  struct ins_operands *ops);
+
+static struct ins_ops arm64_mov_ops = {
+	.parse	   = arm64_mov__parse,
+	.scnprintf = mov__scnprintf,
+};
+
 static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
 {
 	struct arm64_annotate *arm = arch->priv;
@@ -21,7 +78,7 @@ static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const
 	else if (!strcmp(name, "ret"))
 		ops = &ret_ops;
 	else
-		return NULL;
+		ops = &arm64_mov_ops;
 
 	arch__associate_ins_ops(arch, name, ops);
 	return ops;
diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl
index 52e197317d3ee..2dbb8cade048f 100755
--- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl
+++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl
@@ -11,7 +11,8 @@
 
 gcc=$1
 hostcc=$2
-input=$3
+incpath=$3
+input=$4
 
 if ! test -r $input; then
 	echo "Could not read input file" >&2
@@ -28,7 +29,6 @@ create_table_from_c()
 
 	cat <<-_EoHEADER
 		#include <stdio.h>
-		#define __ARCH_WANT_RENAMEAT
 		#include "$input"
 		int main(int argc, char *argv[])
 		{
@@ -42,7 +42,7 @@ create_table_from_c()
 	printf "%s\n" "	printf(\"#define SYSCALLTBL_ARM64_MAX_ID %d\\n\", __NR_$last_sc);"
 	printf "}\n"
 
-	} | $hostcc -o $create_table_exe -x c -
+	} | $hostcc -I $incpath/include/uapi -o $create_table_exe -x c -
 
 	$create_table_exe
 
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index 20e7d74d86cd1..10a44e946f773 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -22,15 +22,16 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
 
 #endif
 
-#if !defined(_CALL_ELF) || _CALL_ELF != 2
 int arch__choose_best_symbol(struct symbol *syma,
 			     struct symbol *symb __maybe_unused)
 {
 	char *sym = syma->name;
 
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
 	/* Skip over any initial dot */
 	if (*sym == '.')
 		sym++;
+#endif
 
 	/* Avoid "SyS" kernel syscall aliases */
 	if (strlen(sym) >= 3 && !strncmp(sym, "SyS", 3))
@@ -41,6 +42,7 @@ int arch__choose_best_symbol(struct symbol *syma,
 	return SYMBOL_A;
 }
 
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
 /* Allow matching against dot variants */
 int arch__compare_symbol_names(const char *namea, const char *nameb)
 {
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
index cee4e2f7c0578..de0dd66dbb481 100644
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/arch/s390/annotate/instructions.c
@@ -100,8 +100,6 @@ static int s390_mov__parse(struct arch *arch __maybe_unused,
 	return -1;
 }
 
-static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
-			  struct ins_operands *ops);
 
 static struct ins_ops s390_mov_ops = {
 	.parse	   = s390_mov__parse,
diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h
index c1bd979b957be..613709cfbbd03 100644
--- a/tools/perf/arch/x86/include/arch-tests.h
+++ b/tools/perf/arch/x86/include/arch-tests.h
@@ -9,6 +9,7 @@ struct test;
 int test__rdpmc(struct test *test __maybe_unused, int subtest);
 int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest);
 int test__insn_x86(struct test *test __maybe_unused, int subtest);
+int test__bp_modify(struct test *test, int subtest);
 
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
 struct thread;
diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build
index 8e2c5a38c3b90..586849ff83a07 100644
--- a/tools/perf/arch/x86/tests/Build
+++ b/tools/perf/arch/x86/tests/Build
@@ -5,3 +5,4 @@ libperf-y += arch-tests.o
 libperf-y += rdpmc.o
 libperf-y += perf-time-to-tsc.o
 libperf-$(CONFIG_AUXTRACE) += insn-x86.o
+libperf-$(CONFIG_X86_64) += bp-modify.o
diff --git a/tools/perf/arch/x86/tests/arch-tests.c b/tools/perf/arch/x86/tests/arch-tests.c
index cc1802ff54109..d47d3f8e3c8e0 100644
--- a/tools/perf/arch/x86/tests/arch-tests.c
+++ b/tools/perf/arch/x86/tests/arch-tests.c
@@ -23,6 +23,12 @@ struct test arch_tests[] = {
 		.desc = "x86 instruction decoder - new instructions",
 		.func = test__insn_x86,
 	},
+#endif
+#if defined(__x86_64__)
+	{
+		.desc = "x86 bp modify",
+		.func = test__bp_modify,
+	},
 #endif
 	{
 		.func = NULL,
diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c
new file mode 100644
index 0000000000000..f53e4406709f8
--- /dev/null
+++ b/tools/perf/arch/x86/tests/bp-modify.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compiler.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/user.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <asm/ptrace.h>
+#include <errno.h>
+#include "debug.h"
+#include "tests/tests.h"
+#include "arch-tests.h"
+
+static noinline int bp_1(void)
+{
+	pr_debug("in %s\n", __func__);
+	return 0;
+}
+
+static noinline int bp_2(void)
+{
+	pr_debug("in %s\n", __func__);
+	return 0;
+}
+
+static int spawn_child(void)
+{
+	int child = fork();
+
+	if (child == 0) {
+		/*
+		 * The child sets itself for as tracee and
+		 * waits in signal for parent to trace it,
+		 * then it calls bp_1 and quits.
+		 */
+		int err = ptrace(PTRACE_TRACEME, 0, NULL, NULL);
+
+		if (err) {
+			pr_debug("failed to PTRACE_TRACEME\n");
+			exit(1);
+		}
+
+		raise(SIGCONT);
+		bp_1();
+		exit(0);
+	}
+
+	return child;
+}
+
+/*
+ * This tests creates HW breakpoint, tries to
+ * change it and checks it was properly changed.
+ */
+static int bp_modify1(void)
+{
+	pid_t child;
+	int status;
+	unsigned long rip = 0, dr7 = 1;
+
+	child = spawn_child();
+
+	waitpid(child, &status, 0);
+	if (WIFEXITED(status)) {
+		pr_debug("tracee exited prematurely 1\n");
+		return TEST_FAIL;
+	}
+
+	/*
+	 * The parent does following steps:
+	 *  - creates a new breakpoint (id 0) for bp_2 function
+	 *  - changes that breakponit to bp_1 function
+	 *  - waits for the breakpoint to hit and checks
+	 *    it has proper rip of bp_1 function
+	 *  - detaches the child
+	 */
+	if (ptrace(PTRACE_POKEUSER, child,
+		   offsetof(struct user, u_debugreg[0]), bp_2)) {
+		pr_debug("failed to set breakpoint, 1st time: %s\n",
+			 strerror(errno));
+		goto out;
+	}
+
+	if (ptrace(PTRACE_POKEUSER, child,
+		   offsetof(struct user, u_debugreg[0]), bp_1)) {
+		pr_debug("failed to set breakpoint, 2nd time: %s\n",
+			 strerror(errno));
+		goto out;
+	}
+
+	if (ptrace(PTRACE_POKEUSER, child,
+		   offsetof(struct user, u_debugreg[7]), dr7)) {
+		pr_debug("failed to set dr7: %s\n", strerror(errno));
+		goto out;
+	}
+
+	if (ptrace(PTRACE_CONT, child, NULL, NULL)) {
+		pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno));
+		goto out;
+	}
+
+	waitpid(child, &status, 0);
+	if (WIFEXITED(status)) {
+		pr_debug("tracee exited prematurely 2\n");
+		return TEST_FAIL;
+	}
+
+	rip = ptrace(PTRACE_PEEKUSER, child,
+		     offsetof(struct user_regs_struct, rip), NULL);
+	if (rip == (unsigned long) -1) {
+		pr_debug("failed to PTRACE_PEEKUSER: %s\n",
+			 strerror(errno));
+		goto out;
+	}
+
+	pr_debug("rip %lx, bp_1 %p\n", rip, bp_1);
+
+out:
+	if (ptrace(PTRACE_DETACH, child, NULL, NULL)) {
+		pr_debug("failed to PTRACE_DETACH: %s", strerror(errno));
+		return TEST_FAIL;
+	}
+
+	return rip == (unsigned long) bp_1 ? TEST_OK : TEST_FAIL;
+}
+
+/*
+ * This tests creates HW breakpoint, tries to
+ * change it to bogus value and checks the original
+ * breakpoint is hit.
+ */
+static int bp_modify2(void)
+{
+	pid_t child;
+	int status;
+	unsigned long rip = 0, dr7 = 1;
+
+	child = spawn_child();
+
+	waitpid(child, &status, 0);
+	if (WIFEXITED(status)) {
+		pr_debug("tracee exited prematurely 1\n");
+		return TEST_FAIL;
+	}
+
+	/*
+	 * The parent does following steps:
+	 *  - creates a new breakpoint (id 0) for bp_1 function
+	 *  - tries to change that breakpoint to (-1) address
+	 *  - waits for the breakpoint to hit and checks
+	 *    it has proper rip of bp_1 function
+	 *  - detaches the child
+	 */
+	if (ptrace(PTRACE_POKEUSER, child,
+		   offsetof(struct user, u_debugreg[0]), bp_1)) {
+		pr_debug("failed to set breakpoint: %s\n",
+			 strerror(errno));
+		goto out;
+	}
+
+	if (ptrace(PTRACE_POKEUSER, child,
+		   offsetof(struct user, u_debugreg[7]), dr7)) {
+		pr_debug("failed to set dr7: %s\n", strerror(errno));
+		goto out;
+	}
+
+	if (!ptrace(PTRACE_POKEUSER, child,
+		   offsetof(struct user, u_debugreg[0]), (unsigned long) (-1))) {
+		pr_debug("failed, breakpoint set to bogus address\n");
+		goto out;
+	}
+
+	if (ptrace(PTRACE_CONT, child, NULL, NULL)) {
+		pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno));
+		goto out;
+	}
+
+	waitpid(child, &status, 0);
+	if (WIFEXITED(status)) {
+		pr_debug("tracee exited prematurely 2\n");
+		return TEST_FAIL;
+	}
+
+	rip = ptrace(PTRACE_PEEKUSER, child,
+		     offsetof(struct user_regs_struct, rip), NULL);
+	if (rip == (unsigned long) -1) {
+		pr_debug("failed to PTRACE_PEEKUSER: %s\n",
+			 strerror(errno));
+		goto out;
+	}
+
+	pr_debug("rip %lx, bp_1 %p\n", rip, bp_1);
+
+out:
+	if (ptrace(PTRACE_DETACH, child, NULL, NULL)) {
+		pr_debug("failed to PTRACE_DETACH: %s", strerror(errno));
+		return TEST_FAIL;
+	}
+
+	return rip == (unsigned long) bp_1 ? TEST_OK : TEST_FAIL;
+}
+
+int test__bp_modify(struct test *test __maybe_unused,
+		    int subtest __maybe_unused)
+{
+	TEST_ASSERT_VAL("modify test 1 failed\n", !bp_modify1());
+	TEST_ASSERT_VAL("modify test 2 failed\n", !bp_modify2());
+
+	return 0;
+}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 22ebeb92ac51d..9853552bcf164 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -758,7 +758,7 @@ static int record__synthesize(struct record *rec, bool tail)
 		 * We need to synthesize events first, because some
 		 * features works on top of them (on report side).
 		 */
-		err = perf_event__synthesize_attrs(tool, session,
+		err = perf_event__synthesize_attrs(tool, rec->evlist,
 						   process_synthesized_event);
 		if (err < 0) {
 			pr_err("Couldn't synthesize attrs.\n");
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index ba481d73f910f..6176bae177c23 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1544,7 +1544,8 @@ struct metric_ctx {
 	FILE 			*fp;
 };
 
-static void script_print_metric(void *ctx, const char *color,
+static void script_print_metric(struct perf_stat_config *config __maybe_unused,
+				void *ctx, const char *color,
 			        const char *fmt,
 			        const char *unit, double val)
 {
@@ -1562,7 +1563,8 @@ static void script_print_metric(void *ctx, const char *color,
 	fprintf(mctx->fp, " %s\n", unit);
 }
 
-static void script_new_line(void *ctx)
+static void script_new_line(struct perf_stat_config *config __maybe_unused,
+			    void *ctx)
 {
 	struct metric_ctx *mctx = ctx;
 
@@ -1608,7 +1610,7 @@ static void perf_sample__fprint_metric(struct perf_script *script,
 	evsel_script(evsel)->val = val;
 	if (evsel_script(evsel->leader)->gnum == evsel->leader->nr_members) {
 		for_each_group_member (ev2, evsel->leader) {
-			perf_stat__print_shadow_stats(ev2,
+			perf_stat__print_shadow_stats(&stat_config, ev2,
 						      evsel_script(ev2)->val,
 						      sample->cpu,
 						      &ctx,
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index d097b5b47eb81..0b0e3961d511a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -88,8 +88,6 @@
 #include "sane_ctype.h"
 
 #define DEFAULT_SEPARATOR	" "
-#define CNTR_NOT_SUPPORTED	"<not supported>"
-#define CNTR_NOT_COUNTED	"<not counted>"
 #define FREEZE_ON_SMI_PATH	"devices/cpu/freeze_on_smi"
 
 static void print_counters(struct timespec *ts, int argc, const char **argv);
@@ -137,54 +135,30 @@ static const char *smi_cost_attrs = {
 
 static struct perf_evlist	*evsel_list;
 
-static struct rblist		 metric_events;
-
 static struct target target = {
 	.uid	= UINT_MAX,
 };
 
-typedef int (*aggr_get_id_t)(struct cpu_map *m, int cpu);
-
 #define METRIC_ONLY_LEN 20
 
-static int			run_count			=  1;
-static bool			no_inherit			= false;
 static volatile pid_t		child_pid			= -1;
-static bool			null_run			=  false;
 static int			detailed_run			=  0;
 static bool			transaction_run;
 static bool			topdown_run			= false;
 static bool			smi_cost			= false;
 static bool			smi_reset			= false;
-static bool			big_num				=  true;
 static int			big_num_opt			=  -1;
-static const char		*csv_sep			= NULL;
-static bool			csv_output			= false;
 static bool			group				= false;
 static const char		*pre_cmd			= NULL;
 static const char		*post_cmd			= NULL;
 static bool			sync_run			= false;
-static unsigned int		initial_delay			= 0;
-static unsigned int		unit_width			= 4; /* strlen("unit") */
 static bool			forever				= false;
-static bool			metric_only			= false;
 static bool			force_metric_only		= false;
-static bool			no_merge			= false;
-static bool			walltime_run_table		= false;
 static struct timespec		ref_time;
-static struct cpu_map		*aggr_map;
-static aggr_get_id_t		aggr_get_id;
 static bool			append_file;
 static bool			interval_count;
-static bool			interval_clear;
 static const char		*output_name;
 static int			output_fd;
-static int			print_free_counters_hint;
-static int			print_mixed_hw_group_error;
-static u64			*walltime_run;
-static bool			ru_display			= false;
-static struct rusage		ru_data;
-static unsigned int		metric_only_len			= METRIC_ONLY_LEN;
 
 struct perf_stat {
 	bool			 record;
@@ -204,15 +178,15 @@ static struct perf_stat		perf_stat;
 static volatile int done = 0;
 
 static struct perf_stat_config stat_config = {
-	.aggr_mode	= AGGR_GLOBAL,
-	.scale		= true,
+	.aggr_mode		= AGGR_GLOBAL,
+	.scale			= true,
+	.unit_width		= 4, /* strlen("unit") */
+	.run_count		= 1,
+	.metric_only_len	= METRIC_ONLY_LEN,
+	.walltime_nsecs_stats	= &walltime_nsecs_stats,
+	.big_num		= true,
 };
 
-static bool is_duration_time(struct perf_evsel *evsel)
-{
-	return !strcmp(evsel->name, "duration_time");
-}
-
 static inline void diff_timespec(struct timespec *r, struct timespec *a,
 				 struct timespec *b)
 {
@@ -236,66 +210,6 @@ static void perf_stat__reset_stats(void)
 		perf_stat__reset_shadow_per_stat(&stat_config.stats[i]);
 }
 
-static int create_perf_stat_counter(struct perf_evsel *evsel)
-{
-	struct perf_event_attr *attr = &evsel->attr;
-	struct perf_evsel *leader = evsel->leader;
-
-	if (stat_config.scale) {
-		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
-				    PERF_FORMAT_TOTAL_TIME_RUNNING;
-	}
-
-	/*
-	 * The event is part of non trivial group, let's enable
-	 * the group read (for leader) and ID retrieval for all
-	 * members.
-	 */
-	if (leader->nr_members > 1)
-		attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP;
-
-	attr->inherit = !no_inherit;
-
-	/*
-	 * Some events get initialized with sample_(period/type) set,
-	 * like tracepoints. Clear it up for counting.
-	 */
-	attr->sample_period = 0;
-
-	/*
-	 * But set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless
-	 * while avoiding that older tools show confusing messages.
-	 *
-	 * However for pipe sessions we need to keep it zero,
-	 * because script's perf_evsel__check_attr is triggered
-	 * by attr->sample_type != 0, and we can't run it on
-	 * stat sessions.
-	 */
-	if (!(STAT_RECORD && perf_stat.data.is_pipe))
-		attr->sample_type = PERF_SAMPLE_IDENTIFIER;
-
-	/*
-	 * Disabling all counters initially, they will be enabled
-	 * either manually by us or by kernel via enable_on_exec
-	 * set later.
-	 */
-	if (perf_evsel__is_group_leader(evsel)) {
-		attr->disabled = 1;
-
-		/*
-		 * In case of initial_delay we enable tracee
-		 * events manually.
-		 */
-		if (target__none(&target) && !initial_delay)
-			attr->enable_on_exec = 1;
-	}
-
-	if (target__has_cpu(&target) && !target__has_per_thread(&target))
-		return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));
-
-	return perf_evsel__open_per_thread(evsel, evsel_list->threads);
-}
-
 static int process_synthesized_event(struct perf_tool *tool __maybe_unused,
 				     union perf_event *event,
 				     struct perf_sample *sample __maybe_unused,
@@ -428,15 +342,15 @@ static void process_interval(void)
 
 static void enable_counters(void)
 {
-	if (initial_delay)
-		usleep(initial_delay * USEC_PER_MSEC);
+	if (stat_config.initial_delay)
+		usleep(stat_config.initial_delay * USEC_PER_MSEC);
 
 	/*
 	 * We need to enable counters only if:
 	 * - we don't have tracee (attaching to task or cpu)
 	 * - we have initial delay configured
 	 */
-	if (!target__none(&target) || initial_delay)
+	if (!target__none(&target) || stat_config.initial_delay)
 		perf_evlist__enable(evsel_list);
 }
 
@@ -464,80 +378,6 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf
 	workload_exec_errno = info->si_value.sival_int;
 }
 
-static int perf_stat_synthesize_config(bool is_pipe)
-{
-	int err;
-
-	if (is_pipe) {
-		err = perf_event__synthesize_attrs(NULL, perf_stat.session,
-						   process_synthesized_event);
-		if (err < 0) {
-			pr_err("Couldn't synthesize attrs.\n");
-			return err;
-		}
-	}
-
-	err = perf_event__synthesize_extra_attr(NULL,
-						evsel_list,
-						process_synthesized_event,
-						is_pipe);
-
-	err = perf_event__synthesize_thread_map2(NULL, evsel_list->threads,
-						process_synthesized_event,
-						NULL);
-	if (err < 0) {
-		pr_err("Couldn't synthesize thread map.\n");
-		return err;
-	}
-
-	err = perf_event__synthesize_cpu_map(NULL, evsel_list->cpus,
-					     process_synthesized_event, NULL);
-	if (err < 0) {
-		pr_err("Couldn't synthesize thread map.\n");
-		return err;
-	}
-
-	err = perf_event__synthesize_stat_config(NULL, &stat_config,
-						 process_synthesized_event, NULL);
-	if (err < 0) {
-		pr_err("Couldn't synthesize config.\n");
-		return err;
-	}
-
-	return 0;
-}
-
-#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-
-static int __store_counter_ids(struct perf_evsel *counter)
-{
-	int cpu, thread;
-
-	for (cpu = 0; cpu < xyarray__max_x(counter->fd); cpu++) {
-		for (thread = 0; thread < xyarray__max_y(counter->fd);
-		     thread++) {
-			int fd = FD(counter, cpu, thread);
-
-			if (perf_evlist__id_add_fd(evsel_list, counter,
-						   cpu, thread, fd) < 0)
-				return -1;
-		}
-	}
-
-	return 0;
-}
-
-static int store_counter_ids(struct perf_evsel *counter)
-{
-	struct cpu_map *cpus = counter->cpus;
-	struct thread_map *threads = counter->threads;
-
-	if (perf_evsel__alloc_id(counter, cpus->nr, threads->nr))
-		return -ENOMEM;
-
-	return __store_counter_ids(counter);
-}
-
 static bool perf_evsel__should_store_id(struct perf_evsel *counter)
 {
 	return STAT_RECORD || counter->attr.read_format & PERF_FORMAT_ID;
@@ -609,7 +449,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 
 	evlist__for_each_entry(evsel_list, counter) {
 try_again:
-		if (create_perf_stat_counter(counter) < 0) {
+		if (create_perf_stat_counter(counter, &stat_config, &target) < 0) {
 
 			/* Weak group failed. Reset the group. */
 			if ((errno == EINVAL || errno == EBADF) &&
@@ -664,11 +504,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		counter->supported = true;
 
 		l = strlen(counter->unit);
-		if (l > unit_width)
-			unit_width = l;
+		if (l > stat_config.unit_width)
+			stat_config.unit_width = l;
 
 		if (perf_evsel__should_store_id(counter) &&
-		    store_counter_ids(counter))
+		    perf_evsel__store_ids(counter, evsel_list))
 			return -1;
 	}
 
@@ -699,7 +539,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		if (err < 0)
 			return err;
 
-		err = perf_stat_synthesize_config(is_pipe);
+		err = perf_stat_synthesize_config(&stat_config, NULL, evsel_list,
+						  process_synthesized_event, is_pipe);
 		if (err < 0)
 			return err;
 	}
@@ -724,7 +565,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 					break;
 			}
 		}
-		wait4(child_pid, &status, 0, &ru_data);
+		wait4(child_pid, &status, 0, &stat_config.ru_data);
 
 		if (workload_exec_errno) {
 			const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
@@ -752,8 +593,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 
 	t1 = rdclock();
 
-	if (walltime_run_table)
-		walltime_run[run_idx] = t1 - t0;
+	if (stat_config.walltime_run_table)
+		stat_config.walltime_run[run_idx] = t1 - t0;
 
 	update_stats(&walltime_nsecs_stats, t1 - t0);
 
@@ -795,1105 +636,14 @@ static int run_perf_stat(int argc, const char **argv, int run_idx)
 	return ret;
 }
 
-static void print_running(u64 run, u64 ena)
-{
-	if (csv_output) {
-		fprintf(stat_config.output, "%s%" PRIu64 "%s%.2f",
-					csv_sep,
-					run,
-					csv_sep,
-					ena ? 100.0 * run / ena : 100.0);
-	} else if (run != ena) {
-		fprintf(stat_config.output, "  (%.2f%%)", 100.0 * run / ena);
-	}
-}
-
-static void print_noise_pct(double total, double avg)
-{
-	double pct = rel_stddev_stats(total, avg);
-
-	if (csv_output)
-		fprintf(stat_config.output, "%s%.2f%%", csv_sep, pct);
-	else if (pct)
-		fprintf(stat_config.output, "  ( +-%6.2f%% )", pct);
-}
-
-static void print_noise(struct perf_evsel *evsel, double avg)
-{
-	struct perf_stat_evsel *ps;
-
-	if (run_count == 1)
-		return;
-
-	ps = evsel->stats;
-	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
-}
-
-static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
-{
-	switch (stat_config.aggr_mode) {
-	case AGGR_CORE:
-		fprintf(stat_config.output, "S%d-C%*d%s%*d%s",
-			cpu_map__id_to_socket(id),
-			csv_output ? 0 : -8,
-			cpu_map__id_to_cpu(id),
-			csv_sep,
-			csv_output ? 0 : 4,
-			nr,
-			csv_sep);
-		break;
-	case AGGR_SOCKET:
-		fprintf(stat_config.output, "S%*d%s%*d%s",
-			csv_output ? 0 : -5,
-			id,
-			csv_sep,
-			csv_output ? 0 : 4,
-			nr,
-			csv_sep);
-			break;
-	case AGGR_NONE:
-		fprintf(stat_config.output, "CPU%*d%s",
-			csv_output ? 0 : -4,
-			perf_evsel__cpus(evsel)->map[id], csv_sep);
-		break;
-	case AGGR_THREAD:
-		fprintf(stat_config.output, "%*s-%*d%s",
-			csv_output ? 0 : 16,
-			thread_map__comm(evsel->threads, id),
-			csv_output ? 0 : -8,
-			thread_map__pid(evsel->threads, id),
-			csv_sep);
-		break;
-	case AGGR_GLOBAL:
-	case AGGR_UNSET:
-	default:
-		break;
-	}
-}
-
-struct outstate {
-	FILE *fh;
-	bool newline;
-	const char *prefix;
-	int  nfields;
-	int  id, nr;
-	struct perf_evsel *evsel;
-};
-
-#define METRIC_LEN  35
-
-static void new_line_std(void *ctx)
-{
-	struct outstate *os = ctx;
-
-	os->newline = true;
-}
-
-static void do_new_line_std(struct outstate *os)
-{
-	fputc('\n', os->fh);
-	fputs(os->prefix, os->fh);
-	aggr_printout(os->evsel, os->id, os->nr);
-	if (stat_config.aggr_mode == AGGR_NONE)
-		fprintf(os->fh, "        ");
-	fprintf(os->fh, "                                                 ");
-}
-
-static void print_metric_std(void *ctx, const char *color, const char *fmt,
-			     const char *unit, double val)
-{
-	struct outstate *os = ctx;
-	FILE *out = os->fh;
-	int n;
-	bool newline = os->newline;
-
-	os->newline = false;
-
-	if (unit == NULL || fmt == NULL) {
-		fprintf(out, "%-*s", METRIC_LEN, "");
-		return;
-	}
-
-	if (newline)
-		do_new_line_std(os);
-
-	n = fprintf(out, " # ");
-	if (color)
-		n += color_fprintf(out, color, fmt, val);
-	else
-		n += fprintf(out, fmt, val);
-	fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
-}
-
-static void new_line_csv(void *ctx)
-{
-	struct outstate *os = ctx;
-	int i;
-
-	fputc('\n', os->fh);
-	if (os->prefix)
-		fprintf(os->fh, "%s%s", os->prefix, csv_sep);
-	aggr_printout(os->evsel, os->id, os->nr);
-	for (i = 0; i < os->nfields; i++)
-		fputs(csv_sep, os->fh);
-}
-
-static void print_metric_csv(void *ctx,
-			     const char *color __maybe_unused,
-			     const char *fmt, const char *unit, double val)
-{
-	struct outstate *os = ctx;
-	FILE *out = os->fh;
-	char buf[64], *vals, *ends;
-
-	if (unit == NULL || fmt == NULL) {
-		fprintf(out, "%s%s", csv_sep, csv_sep);
-		return;
-	}
-	snprintf(buf, sizeof(buf), fmt, val);
-	ends = vals = ltrim(buf);
-	while (isdigit(*ends) || *ends == '.')
-		ends++;
-	*ends = 0;
-	while (isspace(*unit))
-		unit++;
-	fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit);
-}
-
-/* Filter out some columns that don't work well in metrics only mode */
-
-static bool valid_only_metric(const char *unit)
-{
-	if (!unit)
-		return false;
-	if (strstr(unit, "/sec") ||
-	    strstr(unit, "hz") ||
-	    strstr(unit, "Hz") ||
-	    strstr(unit, "CPUs utilized"))
-		return false;
-	return true;
-}
-
-static const char *fixunit(char *buf, struct perf_evsel *evsel,
-			   const char *unit)
-{
-	if (!strncmp(unit, "of all", 6)) {
-		snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel),
-			 unit);
-		return buf;
-	}
-	return unit;
-}
-
-static void print_metric_only(void *ctx, const char *color, const char *fmt,
-			      const char *unit, double val)
-{
-	struct outstate *os = ctx;
-	FILE *out = os->fh;
-	char buf[1024], str[1024];
-	unsigned mlen = metric_only_len;
-
-	if (!valid_only_metric(unit))
-		return;
-	unit = fixunit(buf, os->evsel, unit);
-	if (mlen < strlen(unit))
-		mlen = strlen(unit) + 1;
-
-	if (color)
-		mlen += strlen(color) + sizeof(PERF_COLOR_RESET) - 1;
-
-	color_snprintf(str, sizeof(str), color ?: "", fmt, val);
-	fprintf(out, "%*s ", mlen, str);
-}
-
-static void print_metric_only_csv(void *ctx, const char *color __maybe_unused,
-				  const char *fmt,
-				  const char *unit, double val)
-{
-	struct outstate *os = ctx;
-	FILE *out = os->fh;
-	char buf[64], *vals, *ends;
-	char tbuf[1024];
-
-	if (!valid_only_metric(unit))
-		return;
-	unit = fixunit(tbuf, os->evsel, unit);
-	snprintf(buf, sizeof buf, fmt, val);
-	ends = vals = ltrim(buf);
-	while (isdigit(*ends) || *ends == '.')
-		ends++;
-	*ends = 0;
-	fprintf(out, "%s%s", vals, csv_sep);
-}
-
-static void new_line_metric(void *ctx __maybe_unused)
-{
-}
-
-static void print_metric_header(void *ctx, const char *color __maybe_unused,
-				const char *fmt __maybe_unused,
-				const char *unit, double val __maybe_unused)
-{
-	struct outstate *os = ctx;
-	char tbuf[1024];
-
-	if (!valid_only_metric(unit))
-		return;
-	unit = fixunit(tbuf, os->evsel, unit);
-	if (csv_output)
-		fprintf(os->fh, "%s%s", unit, csv_sep);
-	else
-		fprintf(os->fh, "%*s ", metric_only_len, unit);
-}
-
-static int first_shadow_cpu(struct perf_evsel *evsel, int id)
-{
-	int i;
-
-	if (!aggr_get_id)
-		return 0;
-
-	if (stat_config.aggr_mode == AGGR_NONE)
-		return id;
-
-	if (stat_config.aggr_mode == AGGR_GLOBAL)
-		return 0;
-
-	for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
-		int cpu2 = perf_evsel__cpus(evsel)->map[i];
-
-		if (aggr_get_id(evsel_list->cpus, cpu2) == id)
-			return cpu2;
-	}
-	return 0;
-}
-
-static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
-{
-	FILE *output = stat_config.output;
-	double sc =  evsel->scale;
-	const char *fmt;
-
-	if (csv_output) {
-		fmt = floor(sc) != sc ?  "%.2f%s" : "%.0f%s";
-	} else {
-		if (big_num)
-			fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s";
-		else
-			fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s";
-	}
-
-	aggr_printout(evsel, id, nr);
-
-	fprintf(output, fmt, avg, csv_sep);
-
-	if (evsel->unit)
-		fprintf(output, "%-*s%s",
-			csv_output ? 0 : unit_width,
-			evsel->unit, csv_sep);
-
-	fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel));
-
-	if (evsel->cgrp)
-		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
-}
-
-static bool is_mixed_hw_group(struct perf_evsel *counter)
-{
-	struct perf_evlist *evlist = counter->evlist;
-	u32 pmu_type = counter->attr.type;
-	struct perf_evsel *pos;
-
-	if (counter->nr_members < 2)
-		return false;
-
-	evlist__for_each_entry(evlist, pos) {
-		/* software events can be part of any hardware group */
-		if (pos->attr.type == PERF_TYPE_SOFTWARE)
-			continue;
-		if (pmu_type == PERF_TYPE_SOFTWARE) {
-			pmu_type = pos->attr.type;
-			continue;
-		}
-		if (pmu_type != pos->attr.type)
-			return true;
-	}
-
-	return false;
-}
-
-static void printout(int id, int nr, struct perf_evsel *counter, double uval,
-		     char *prefix, u64 run, u64 ena, double noise,
-		     struct runtime_stat *st)
-{
-	struct perf_stat_output_ctx out;
-	struct outstate os = {
-		.fh = stat_config.output,
-		.prefix = prefix ? prefix : "",
-		.id = id,
-		.nr = nr,
-		.evsel = counter,
-	};
-	print_metric_t pm = print_metric_std;
-	void (*nl)(void *);
-
-	if (metric_only) {
-		nl = new_line_metric;
-		if (csv_output)
-			pm = print_metric_only_csv;
-		else
-			pm = print_metric_only;
-	} else
-		nl = new_line_std;
-
-	if (csv_output && !metric_only) {
-		static int aggr_fields[] = {
-			[AGGR_GLOBAL] = 0,
-			[AGGR_THREAD] = 1,
-			[AGGR_NONE] = 1,
-			[AGGR_SOCKET] = 2,
-			[AGGR_CORE] = 2,
-		};
-
-		pm = print_metric_csv;
-		nl = new_line_csv;
-		os.nfields = 3;
-		os.nfields += aggr_fields[stat_config.aggr_mode];
-		if (counter->cgrp)
-			os.nfields++;
-	}
-	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
-		if (metric_only) {
-			pm(&os, NULL, "", "", 0);
-			return;
-		}
-		aggr_printout(counter, id, nr);
-
-		fprintf(stat_config.output, "%*s%s",
-			csv_output ? 0 : 18,
-			counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-			csv_sep);
-
-		if (counter->supported) {
-			print_free_counters_hint = 1;
-			if (is_mixed_hw_group(counter))
-				print_mixed_hw_group_error = 1;
-		}
-
-		fprintf(stat_config.output, "%-*s%s",
-			csv_output ? 0 : unit_width,
-			counter->unit, csv_sep);
-
-		fprintf(stat_config.output, "%*s",
-			csv_output ? 0 : -25,
-			perf_evsel__name(counter));
-
-		if (counter->cgrp)
-			fprintf(stat_config.output, "%s%s",
-				csv_sep, counter->cgrp->name);
-
-		if (!csv_output)
-			pm(&os, NULL, NULL, "", 0);
-		print_noise(counter, noise);
-		print_running(run, ena);
-		if (csv_output)
-			pm(&os, NULL, NULL, "", 0);
-		return;
-	}
-
-	if (!metric_only)
-		abs_printout(id, nr, counter, uval);
-
-	out.print_metric = pm;
-	out.new_line = nl;
-	out.ctx = &os;
-	out.force_header = false;
-
-	if (csv_output && !metric_only) {
-		print_noise(counter, noise);
-		print_running(run, ena);
-	}
-
-	perf_stat__print_shadow_stats(counter, uval,
-				first_shadow_cpu(counter, id),
-				&out, &metric_events, st);
-	if (!csv_output && !metric_only) {
-		print_noise(counter, noise);
-		print_running(run, ena);
-	}
-}
-
-static void aggr_update_shadow(void)
-{
-	int cpu, s2, id, s;
-	u64 val;
-	struct perf_evsel *counter;
-
-	for (s = 0; s < aggr_map->nr; s++) {
-		id = aggr_map->map[s];
-		evlist__for_each_entry(evsel_list, counter) {
-			val = 0;
-			for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-				s2 = aggr_get_id(evsel_list->cpus, cpu);
-				if (s2 != id)
-					continue;
-				val += perf_counts(counter->counts, cpu, 0)->val;
-			}
-			perf_stat__update_shadow_stats(counter, val,
-					first_shadow_cpu(counter, id),
-					&rt_stat);
-		}
-	}
-}
-
-static void uniquify_event_name(struct perf_evsel *counter)
-{
-	char *new_name;
-	char *config;
-
-	if (counter->uniquified_name ||
-	    !counter->pmu_name || !strncmp(counter->name, counter->pmu_name,
-					   strlen(counter->pmu_name)))
-		return;
-
-	config = strchr(counter->name, '/');
-	if (config) {
-		if (asprintf(&new_name,
-			     "%s%s", counter->pmu_name, config) > 0) {
-			free(counter->name);
-			counter->name = new_name;
-		}
-	} else {
-		if (asprintf(&new_name,
-			     "%s [%s]", counter->name, counter->pmu_name) > 0) {
-			free(counter->name);
-			counter->name = new_name;
-		}
-	}
-
-	counter->uniquified_name = true;
-}
-
-static void collect_all_aliases(struct perf_evsel *counter,
-			    void (*cb)(struct perf_evsel *counter, void *data,
-				       bool first),
-			    void *data)
-{
-	struct perf_evsel *alias;
-
-	alias = list_prepare_entry(counter, &(evsel_list->entries), node);
-	list_for_each_entry_continue (alias, &evsel_list->entries, node) {
-		if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) ||
-		    alias->scale != counter->scale ||
-		    alias->cgrp != counter->cgrp ||
-		    strcmp(alias->unit, counter->unit) ||
-		    perf_evsel__is_clock(alias) != perf_evsel__is_clock(counter))
-			break;
-		alias->merged_stat = true;
-		cb(alias, data, false);
-	}
-}
-
-static bool collect_data(struct perf_evsel *counter,
-			    void (*cb)(struct perf_evsel *counter, void *data,
-				       bool first),
-			    void *data)
-{
-	if (counter->merged_stat)
-		return false;
-	cb(counter, data, true);
-	if (no_merge)
-		uniquify_event_name(counter);
-	else if (counter->auto_merge_stats)
-		collect_all_aliases(counter, cb, data);
-	return true;
-}
-
-struct aggr_data {
-	u64 ena, run, val;
-	int id;
-	int nr;
-	int cpu;
-};
-
-static void aggr_cb(struct perf_evsel *counter, void *data, bool first)
-{
-	struct aggr_data *ad = data;
-	int cpu, s2;
-
-	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-		struct perf_counts_values *counts;
-
-		s2 = aggr_get_id(perf_evsel__cpus(counter), cpu);
-		if (s2 != ad->id)
-			continue;
-		if (first)
-			ad->nr++;
-		counts = perf_counts(counter->counts, cpu, 0);
-		/*
-		 * When any result is bad, make them all to give
-		 * consistent output in interval mode.
-		 */
-		if (counts->ena == 0 || counts->run == 0 ||
-		    counter->counts->scaled == -1) {
-			ad->ena = 0;
-			ad->run = 0;
-			break;
-		}
-		ad->val += counts->val;
-		ad->ena += counts->ena;
-		ad->run += counts->run;
-	}
-}
-
-static void print_aggr(char *prefix)
-{
-	FILE *output = stat_config.output;
-	struct perf_evsel *counter;
-	int s, id, nr;
-	double uval;
-	u64 ena, run, val;
-	bool first;
-
-	if (!(aggr_map || aggr_get_id))
-		return;
-
-	aggr_update_shadow();
-
-	/*
-	 * With metric_only everything is on a single line.
-	 * Without each counter has its own line.
-	 */
-	for (s = 0; s < aggr_map->nr; s++) {
-		struct aggr_data ad;
-		if (prefix && metric_only)
-			fprintf(output, "%s", prefix);
-
-		ad.id = id = aggr_map->map[s];
-		first = true;
-		evlist__for_each_entry(evsel_list, counter) {
-			if (is_duration_time(counter))
-				continue;
-
-			ad.val = ad.ena = ad.run = 0;
-			ad.nr = 0;
-			if (!collect_data(counter, aggr_cb, &ad))
-				continue;
-			nr = ad.nr;
-			ena = ad.ena;
-			run = ad.run;
-			val = ad.val;
-			if (first && metric_only) {
-				first = false;
-				aggr_printout(counter, id, nr);
-			}
-			if (prefix && !metric_only)
-				fprintf(output, "%s", prefix);
-
-			uval = val * counter->scale;
-			printout(id, nr, counter, uval, prefix, run, ena, 1.0,
-				 &rt_stat);
-			if (!metric_only)
-				fputc('\n', output);
-		}
-		if (metric_only)
-			fputc('\n', output);
-	}
-}
-
-static int cmp_val(const void *a, const void *b)
-{
-	return ((struct perf_aggr_thread_value *)b)->val -
-		((struct perf_aggr_thread_value *)a)->val;
-}
-
-static struct perf_aggr_thread_value *sort_aggr_thread(
-					struct perf_evsel *counter,
-					int nthreads, int ncpus,
-					int *ret)
-{
-	int cpu, thread, i = 0;
-	double uval;
-	struct perf_aggr_thread_value *buf;
-
-	buf = calloc(nthreads, sizeof(struct perf_aggr_thread_value));
-	if (!buf)
-		return NULL;
-
-	for (thread = 0; thread < nthreads; thread++) {
-		u64 ena = 0, run = 0, val = 0;
-
-		for (cpu = 0; cpu < ncpus; cpu++) {
-			val += perf_counts(counter->counts, cpu, thread)->val;
-			ena += perf_counts(counter->counts, cpu, thread)->ena;
-			run += perf_counts(counter->counts, cpu, thread)->run;
-		}
-
-		uval = val * counter->scale;
-
-		/*
-		 * Skip value 0 when enabling --per-thread globally,
-		 * otherwise too many 0 output.
-		 */
-		if (uval == 0.0 && target__has_per_thread(&target))
-			continue;
-
-		buf[i].counter = counter;
-		buf[i].id = thread;
-		buf[i].uval = uval;
-		buf[i].val = val;
-		buf[i].run = run;
-		buf[i].ena = ena;
-		i++;
-	}
-
-	qsort(buf, i, sizeof(struct perf_aggr_thread_value), cmp_val);
-
-	if (ret)
-		*ret = i;
-
-	return buf;
-}
-
-static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
-{
-	FILE *output = stat_config.output;
-	int nthreads = thread_map__nr(counter->threads);
-	int ncpus = cpu_map__nr(counter->cpus);
-	int thread, sorted_threads, id;
-	struct perf_aggr_thread_value *buf;
-
-	buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads);
-	if (!buf) {
-		perror("cannot sort aggr thread");
-		return;
-	}
-
-	for (thread = 0; thread < sorted_threads; thread++) {
-		if (prefix)
-			fprintf(output, "%s", prefix);
-
-		id = buf[thread].id;
-		if (stat_config.stats)
-			printout(id, 0, buf[thread].counter, buf[thread].uval,
-				 prefix, buf[thread].run, buf[thread].ena, 1.0,
-				 &stat_config.stats[id]);
-		else
-			printout(id, 0, buf[thread].counter, buf[thread].uval,
-				 prefix, buf[thread].run, buf[thread].ena, 1.0,
-				 &rt_stat);
-		fputc('\n', output);
-	}
-
-	free(buf);
-}
-
-struct caggr_data {
-	double avg, avg_enabled, avg_running;
-};
-
-static void counter_aggr_cb(struct perf_evsel *counter, void *data,
-			    bool first __maybe_unused)
-{
-	struct caggr_data *cd = data;
-	struct perf_stat_evsel *ps = counter->stats;
-
-	cd->avg += avg_stats(&ps->res_stats[0]);
-	cd->avg_enabled += avg_stats(&ps->res_stats[1]);
-	cd->avg_running += avg_stats(&ps->res_stats[2]);
-}
-
-/*
- * Print out the results of a single counter:
- * aggregated counts in system-wide mode
- */
-static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
-{
-	FILE *output = stat_config.output;
-	double uval;
-	struct caggr_data cd = { .avg = 0.0 };
-
-	if (!collect_data(counter, counter_aggr_cb, &cd))
-		return;
-
-	if (prefix && !metric_only)
-		fprintf(output, "%s", prefix);
-
-	uval = cd.avg * counter->scale;
-	printout(-1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled,
-		 cd.avg, &rt_stat);
-	if (!metric_only)
-		fprintf(output, "\n");
-}
-
-static void counter_cb(struct perf_evsel *counter, void *data,
-		       bool first __maybe_unused)
-{
-	struct aggr_data *ad = data;
-
-	ad->val += perf_counts(counter->counts, ad->cpu, 0)->val;
-	ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena;
-	ad->run += perf_counts(counter->counts, ad->cpu, 0)->run;
-}
-
-/*
- * Print out the results of a single counter:
- * does not use aggregated count in system-wide
- */
-static void print_counter(struct perf_evsel *counter, char *prefix)
-{
-	FILE *output = stat_config.output;
-	u64 ena, run, val;
-	double uval;
-	int cpu;
-
-	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-		struct aggr_data ad = { .cpu = cpu };
-
-		if (!collect_data(counter, counter_cb, &ad))
-			return;
-		val = ad.val;
-		ena = ad.ena;
-		run = ad.run;
-
-		if (prefix)
-			fprintf(output, "%s", prefix);
-
-		uval = val * counter->scale;
-		printout(cpu, 0, counter, uval, prefix, run, ena, 1.0,
-			 &rt_stat);
-
-		fputc('\n', output);
-	}
-}
-
-static void print_no_aggr_metric(char *prefix)
-{
-	int cpu;
-	int nrcpus = 0;
-	struct perf_evsel *counter;
-	u64 ena, run, val;
-	double uval;
-
-	nrcpus = evsel_list->cpus->nr;
-	for (cpu = 0; cpu < nrcpus; cpu++) {
-		bool first = true;
-
-		if (prefix)
-			fputs(prefix, stat_config.output);
-		evlist__for_each_entry(evsel_list, counter) {
-			if (is_duration_time(counter))
-				continue;
-			if (first) {
-				aggr_printout(counter, cpu, 0);
-				first = false;
-			}
-			val = perf_counts(counter->counts, cpu, 0)->val;
-			ena = perf_counts(counter->counts, cpu, 0)->ena;
-			run = perf_counts(counter->counts, cpu, 0)->run;
-
-			uval = val * counter->scale;
-			printout(cpu, 0, counter, uval, prefix, run, ena, 1.0,
-				 &rt_stat);
-		}
-		fputc('\n', stat_config.output);
-	}
-}
-
-static int aggr_header_lens[] = {
-	[AGGR_CORE] = 18,
-	[AGGR_SOCKET] = 12,
-	[AGGR_NONE] = 6,
-	[AGGR_THREAD] = 24,
-	[AGGR_GLOBAL] = 0,
-};
-
-static const char *aggr_header_csv[] = {
-	[AGGR_CORE] 	= 	"core,cpus,",
-	[AGGR_SOCKET] 	= 	"socket,cpus",
-	[AGGR_NONE] 	= 	"cpu,",
-	[AGGR_THREAD] 	= 	"comm-pid,",
-	[AGGR_GLOBAL] 	=	""
-};
-
-static void print_metric_headers(const char *prefix, bool no_indent)
-{
-	struct perf_stat_output_ctx out;
-	struct perf_evsel *counter;
-	struct outstate os = {
-		.fh = stat_config.output
-	};
-
-	if (prefix)
-		fprintf(stat_config.output, "%s", prefix);
-
-	if (!csv_output && !no_indent)
-		fprintf(stat_config.output, "%*s",
-			aggr_header_lens[stat_config.aggr_mode], "");
-	if (csv_output) {
-		if (stat_config.interval)
-			fputs("time,", stat_config.output);
-		fputs(aggr_header_csv[stat_config.aggr_mode],
-			stat_config.output);
-	}
-
-	/* Print metrics headers only */
-	evlist__for_each_entry(evsel_list, counter) {
-		if (is_duration_time(counter))
-			continue;
-		os.evsel = counter;
-		out.ctx = &os;
-		out.print_metric = print_metric_header;
-		out.new_line = new_line_metric;
-		out.force_header = true;
-		os.evsel = counter;
-		perf_stat__print_shadow_stats(counter, 0,
-					      0,
-					      &out,
-					      &metric_events,
-					      &rt_stat);
-	}
-	fputc('\n', stat_config.output);
-}
-
-static void print_interval(char *prefix, struct timespec *ts)
-{
-	FILE *output = stat_config.output;
-	static int num_print_interval;
-
-	if (interval_clear)
-		puts(CONSOLE_CLEAR);
-
-	sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
-
-	if ((num_print_interval == 0 && !csv_output) || interval_clear) {
-		switch (stat_config.aggr_mode) {
-		case AGGR_SOCKET:
-			fprintf(output, "#           time socket cpus");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_CORE:
-			fprintf(output, "#           time core         cpus");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_NONE:
-			fprintf(output, "#           time CPU    ");
-			if (!metric_only)
-				fprintf(output, "                counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_THREAD:
-			fprintf(output, "#           time             comm-pid");
-			if (!metric_only)
-				fprintf(output, "                  counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_GLOBAL:
-		default:
-			fprintf(output, "#           time");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-		case AGGR_UNSET:
-			break;
-		}
-	}
-
-	if ((num_print_interval == 0 || interval_clear) && metric_only)
-		print_metric_headers(" ", true);
-	if (++num_print_interval == 25)
-		num_print_interval = 0;
-}
-
-static void print_header(int argc, const char **argv)
-{
-	FILE *output = stat_config.output;
-	int i;
-
-	fflush(stdout);
-
-	if (!csv_output) {
-		fprintf(output, "\n");
-		fprintf(output, " Performance counter stats for ");
-		if (target.system_wide)
-			fprintf(output, "\'system wide");
-		else if (target.cpu_list)
-			fprintf(output, "\'CPU(s) %s", target.cpu_list);
-		else if (!target__has_task(&target)) {
-			fprintf(output, "\'%s", argv ? argv[0] : "pipe");
-			for (i = 1; argv && (i < argc); i++)
-				fprintf(output, " %s", argv[i]);
-		} else if (target.pid)
-			fprintf(output, "process id \'%s", target.pid);
-		else
-			fprintf(output, "thread id \'%s", target.tid);
-
-		fprintf(output, "\'");
-		if (run_count > 1)
-			fprintf(output, " (%d runs)", run_count);
-		fprintf(output, ":\n\n");
-	}
-}
-
-static int get_precision(double num)
-{
-	if (num > 1)
-		return 0;
-
-	return lround(ceil(-log10(num)));
-}
-
-static void print_table(FILE *output, int precision, double avg)
-{
-	char tmp[64];
-	int idx, indent = 0;
-
-	scnprintf(tmp, 64, " %17.*f", precision, avg);
-	while (tmp[indent] == ' ')
-		indent++;
-
-	fprintf(output, "%*s# Table of individual measurements:\n", indent, "");
-
-	for (idx = 0; idx < run_count; idx++) {
-		double run = (double) walltime_run[idx] / NSEC_PER_SEC;
-		int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5);
-
-		fprintf(output, " %17.*f (%+.*f) ",
-			precision, run, precision, run - avg);
-
-		for (h = 0; h < n; h++)
-			fprintf(output, "#");
-
-		fprintf(output, "\n");
-	}
-
-	fprintf(output, "\n%*s# Final result:\n", indent, "");
-}
-
-static double timeval2double(struct timeval *t)
-{
-	return t->tv_sec + (double) t->tv_usec/USEC_PER_SEC;
-}
-
-static void print_footer(void)
-{
-	double avg = avg_stats(&walltime_nsecs_stats) / NSEC_PER_SEC;
-	FILE *output = stat_config.output;
-	int n;
-
-	if (!null_run)
-		fprintf(output, "\n");
-
-	if (run_count == 1) {
-		fprintf(output, " %17.9f seconds time elapsed", avg);
-
-		if (ru_display) {
-			double ru_utime = timeval2double(&ru_data.ru_utime);
-			double ru_stime = timeval2double(&ru_data.ru_stime);
-
-			fprintf(output, "\n\n");
-			fprintf(output, " %17.9f seconds user\n", ru_utime);
-			fprintf(output, " %17.9f seconds sys\n", ru_stime);
-		}
-	} else {
-		double sd = stddev_stats(&walltime_nsecs_stats) / NSEC_PER_SEC;
-		/*
-		 * Display at most 2 more significant
-		 * digits than the stddev inaccuracy.
-		 */
-		int precision = get_precision(sd) + 2;
-
-		if (walltime_run_table)
-			print_table(output, precision, avg);
-
-		fprintf(output, " %17.*f +- %.*f seconds time elapsed",
-			precision, avg, precision, sd);
-
-		print_noise_pct(sd, avg);
-	}
-	fprintf(output, "\n\n");
-
-	if (print_free_counters_hint &&
-	    sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 &&
-	    n > 0)
-		fprintf(output,
-"Some events weren't counted. Try disabling the NMI watchdog:\n"
-"	echo 0 > /proc/sys/kernel/nmi_watchdog\n"
-"	perf stat ...\n"
-"	echo 1 > /proc/sys/kernel/nmi_watchdog\n");
-
-	if (print_mixed_hw_group_error)
-		fprintf(output,
-			"The events in group usually have to be from "
-			"the same PMU. Try reorganizing the group.\n");
-}
-
 static void print_counters(struct timespec *ts, int argc, const char **argv)
 {
-	int interval = stat_config.interval;
-	struct perf_evsel *counter;
-	char buf[64], *prefix = NULL;
-
 	/* Do not print anything if we record to the pipe. */
 	if (STAT_RECORD && perf_stat.data.is_pipe)
 		return;
 
-	if (interval)
-		print_interval(prefix = buf, ts);
-	else
-		print_header(argc, argv);
-
-	if (metric_only) {
-		static int num_print_iv;
-
-		if (num_print_iv == 0 && !interval)
-			print_metric_headers(prefix, false);
-		if (num_print_iv++ == 25)
-			num_print_iv = 0;
-		if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
-			fprintf(stat_config.output, "%s", prefix);
-	}
-
-	switch (stat_config.aggr_mode) {
-	case AGGR_CORE:
-	case AGGR_SOCKET:
-		print_aggr(prefix);
-		break;
-	case AGGR_THREAD:
-		evlist__for_each_entry(evsel_list, counter) {
-			if (is_duration_time(counter))
-				continue;
-			print_aggr_thread(counter, prefix);
-		}
-		break;
-	case AGGR_GLOBAL:
-		evlist__for_each_entry(evsel_list, counter) {
-			if (is_duration_time(counter))
-				continue;
-			print_counter_aggr(counter, prefix);
-		}
-		if (metric_only)
-			fputc('\n', stat_config.output);
-		break;
-	case AGGR_NONE:
-		if (metric_only)
-			print_no_aggr_metric(prefix);
-		else {
-			evlist__for_each_entry(evsel_list, counter) {
-				if (is_duration_time(counter))
-					continue;
-				print_counter(counter, prefix);
-			}
-		}
-		break;
-	case AGGR_UNSET:
-	default:
-		break;
-	}
-
-	if (!interval && !csv_output)
-		print_footer();
-
-	fflush(stat_config.output);
+	perf_evlist__print_counters(evsel_list, &stat_config, &target,
+				    ts, argc, argv);
 }
 
 static volatile int signr = -1;
@@ -1950,7 +700,7 @@ static int enable_metric_only(const struct option *opt __maybe_unused,
 			      const char *s __maybe_unused, int unset)
 {
 	force_metric_only = true;
-	metric_only = !unset;
+	stat_config.metric_only = !unset;
 	return 0;
 }
 
@@ -1958,7 +708,7 @@ static int parse_metric_groups(const struct option *opt,
 			       const char *str,
 			       int unset __maybe_unused)
 {
-	return metricgroup__parse_groups(opt, str, &metric_events);
+	return metricgroup__parse_groups(opt, str, &stat_config.metric_events);
 }
 
 static const struct option stat_options[] = {
@@ -1969,7 +719,7 @@ static const struct option stat_options[] = {
 		     parse_events_option),
 	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
 		     "event filter", parse_filter),
-	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
+	OPT_BOOLEAN('i', "no-inherit", &stat_config.no_inherit,
 		    "child tasks do not inherit counters"),
 	OPT_STRING('p', "pid", &target.pid, "pid",
 		   "stat events on existing process id"),
@@ -1982,11 +732,11 @@ static const struct option stat_options[] = {
 	OPT_BOOLEAN('c', "scale", &stat_config.scale, "scale/normalize counters"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
-	OPT_INTEGER('r', "repeat", &run_count,
+	OPT_INTEGER('r', "repeat", &stat_config.run_count,
 		    "repeat command and print average + stddev (max: 100, forever: 0)"),
-	OPT_BOOLEAN(0, "table", &walltime_run_table,
+	OPT_BOOLEAN(0, "table", &stat_config.walltime_run_table,
 		    "display details about each run (only with -r option)"),
-	OPT_BOOLEAN('n', "null", &null_run,
+	OPT_BOOLEAN('n', "null", &stat_config.null_run,
 		    "null run - dont start any counters"),
 	OPT_INCR('d', "detailed", &detailed_run,
 		    "detailed run - start a lot of events"),
@@ -1999,8 +749,8 @@ static const struct option stat_options[] = {
 		    "list of cpus to monitor in system-wide"),
 	OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode,
 		    "disable CPU count aggregation", AGGR_NONE),
-	OPT_BOOLEAN(0, "no-merge", &no_merge, "Do not merge identical named events"),
-	OPT_STRING('x', "field-separator", &csv_sep, "separator",
+	OPT_BOOLEAN(0, "no-merge", &stat_config.no_merge, "Do not merge identical named events"),
+	OPT_STRING('x', "field-separator", &stat_config.csv_sep, "separator",
 		   "print counts with custom separator"),
 	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
 		     "monitor event in cgroup name only", parse_cgroups),
@@ -2017,7 +767,7 @@ static const struct option stat_options[] = {
 		    "(overhead is possible for values <= 100ms)"),
 	OPT_INTEGER(0, "interval-count", &stat_config.times,
 		    "print counts for fixed number of times"),
-	OPT_BOOLEAN(0, "interval-clear", &interval_clear,
+	OPT_BOOLEAN(0, "interval-clear", &stat_config.interval_clear,
 		    "clear screen in between new interval"),
 	OPT_UINTEGER(0, "timeout", &stat_config.timeout,
 		    "stop workload and print counts after a timeout period in ms (>= 10ms)"),
@@ -2027,9 +777,9 @@ static const struct option stat_options[] = {
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
 		     "aggregate counts per thread", AGGR_THREAD),
-	OPT_UINTEGER('D', "delay", &initial_delay,
+	OPT_UINTEGER('D', "delay", &stat_config.initial_delay,
 		     "ms to wait before starting measurement after program start"),
-	OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL,
+	OPT_CALLBACK_NOOPT(0, "metric-only", &stat_config.metric_only, NULL,
 			"Only print computed metrics. No raw values", enable_metric_only),
 	OPT_BOOLEAN(0, "topdown", &topdown_run,
 			"measure topdown level 1 statistics"),
@@ -2041,12 +791,14 @@ static const struct option stat_options[] = {
 	OPT_END()
 };
 
-static int perf_stat__get_socket(struct cpu_map *map, int cpu)
+static int perf_stat__get_socket(struct perf_stat_config *config __maybe_unused,
+				 struct cpu_map *map, int cpu)
 {
 	return cpu_map__get_socket(map, cpu, NULL);
 }
 
-static int perf_stat__get_core(struct cpu_map *map, int cpu)
+static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
+			       struct cpu_map *map, int cpu)
 {
 	return cpu_map__get_core(map, cpu, NULL);
 }
@@ -2063,9 +815,8 @@ static int cpu_map__get_max(struct cpu_map *map)
 	return max;
 }
 
-static struct cpu_map *cpus_aggr_map;
-
-static int perf_stat__get_aggr(aggr_get_id_t get_id, struct cpu_map *map, int idx)
+static int perf_stat__get_aggr(struct perf_stat_config *config,
+			       aggr_get_id_t get_id, struct cpu_map *map, int idx)
 {
 	int cpu;
 
@@ -2074,20 +825,22 @@ static int perf_stat__get_aggr(aggr_get_id_t get_id, struct cpu_map *map, int id
 
 	cpu = map->map[idx];
 
-	if (cpus_aggr_map->map[cpu] == -1)
-		cpus_aggr_map->map[cpu] = get_id(map, idx);
+	if (config->cpus_aggr_map->map[cpu] == -1)
+		config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
 
-	return cpus_aggr_map->map[cpu];
+	return config->cpus_aggr_map->map[cpu];
 }
 
-static int perf_stat__get_socket_cached(struct cpu_map *map, int idx)
+static int perf_stat__get_socket_cached(struct perf_stat_config *config,
+					struct cpu_map *map, int idx)
 {
-	return perf_stat__get_aggr(perf_stat__get_socket, map, idx);
+	return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx);
 }
 
-static int perf_stat__get_core_cached(struct cpu_map *map, int idx)
+static int perf_stat__get_core_cached(struct perf_stat_config *config,
+				      struct cpu_map *map, int idx)
 {
-	return perf_stat__get_aggr(perf_stat__get_core, map, idx);
+	return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
 }
 
 static int perf_stat_init_aggr_mode(void)
@@ -2096,18 +849,18 @@ static int perf_stat_init_aggr_mode(void)
 
 	switch (stat_config.aggr_mode) {
 	case AGGR_SOCKET:
-		if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
+		if (cpu_map__build_socket_map(evsel_list->cpus, &stat_config.aggr_map)) {
 			perror("cannot build socket map");
 			return -1;
 		}
-		aggr_get_id = perf_stat__get_socket_cached;
+		stat_config.aggr_get_id = perf_stat__get_socket_cached;
 		break;
 	case AGGR_CORE:
-		if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
+		if (cpu_map__build_core_map(evsel_list->cpus, &stat_config.aggr_map)) {
 			perror("cannot build core map");
 			return -1;
 		}
-		aggr_get_id = perf_stat__get_core_cached;
+		stat_config.aggr_get_id = perf_stat__get_core_cached;
 		break;
 	case AGGR_NONE:
 	case AGGR_GLOBAL:
@@ -2123,16 +876,16 @@ static int perf_stat_init_aggr_mode(void)
 	 * the aggregation translate cpumap.
 	 */
 	nr = cpu_map__get_max(evsel_list->cpus);
-	cpus_aggr_map = cpu_map__empty_new(nr + 1);
-	return cpus_aggr_map ? 0 : -ENOMEM;
+	stat_config.cpus_aggr_map = cpu_map__empty_new(nr + 1);
+	return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
 
 static void perf_stat__exit_aggr_mode(void)
 {
-	cpu_map__put(aggr_map);
-	cpu_map__put(cpus_aggr_map);
-	aggr_map = NULL;
-	cpus_aggr_map = NULL;
+	cpu_map__put(stat_config.aggr_map);
+	cpu_map__put(stat_config.cpus_aggr_map);
+	stat_config.aggr_map = NULL;
+	stat_config.cpus_aggr_map = NULL;
 }
 
 static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, int idx)
@@ -2190,12 +943,14 @@ static int perf_env__build_core_map(struct perf_env *env, struct cpu_map *cpus,
 	return cpu_map__build_map(cpus, corep, perf_env__get_core, env);
 }
 
-static int perf_stat__get_socket_file(struct cpu_map *map, int idx)
+static int perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
+				      struct cpu_map *map, int idx)
 {
 	return perf_env__get_socket(map, idx, &perf_stat.session->header.env);
 }
 
-static int perf_stat__get_core_file(struct cpu_map *map, int idx)
+static int perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused,
+				    struct cpu_map *map, int idx)
 {
 	return perf_env__get_core(map, idx, &perf_stat.session->header.env);
 }
@@ -2206,18 +961,18 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
 
 	switch (stat_config.aggr_mode) {
 	case AGGR_SOCKET:
-		if (perf_env__build_socket_map(env, evsel_list->cpus, &aggr_map)) {
+		if (perf_env__build_socket_map(env, evsel_list->cpus, &stat_config.aggr_map)) {
 			perror("cannot build socket map");
 			return -1;
 		}
-		aggr_get_id = perf_stat__get_socket_file;
+		stat_config.aggr_get_id = perf_stat__get_socket_file;
 		break;
 	case AGGR_CORE:
-		if (perf_env__build_core_map(env, evsel_list->cpus, &aggr_map)) {
+		if (perf_env__build_core_map(env, evsel_list->cpus, &stat_config.aggr_map)) {
 			perror("cannot build core map");
 			return -1;
 		}
-		aggr_get_id = perf_stat__get_core_file;
+		stat_config.aggr_get_id = perf_stat__get_core_file;
 		break;
 	case AGGR_NONE:
 	case AGGR_GLOBAL:
@@ -2401,7 +1156,7 @@ static int add_default_attributes(void)
 	struct parse_events_error errinfo;
 
 	/* Set attrs if no event is selected and !null_run: */
-	if (null_run)
+	if (stat_config.null_run)
 		return 0;
 
 	if (transaction_run) {
@@ -2414,7 +1169,7 @@ static int add_default_attributes(void)
 			struct option opt = { .value = &evsel_list };
 
 			return metricgroup__parse_groups(&opt, "transaction",
-							 &metric_events);
+							 &stat_config.metric_events);
 		}
 
 		if (pmu_have_event("cpu", "cycles-ct") &&
@@ -2452,7 +1207,7 @@ static int add_default_attributes(void)
 		if (pmu_have_event("msr", "aperf") &&
 		    pmu_have_event("msr", "smi")) {
 			if (!force_metric_only)
-				metric_only = true;
+				stat_config.metric_only = true;
 			err = parse_events(evsel_list, smi_cost_attrs, &errinfo);
 		} else {
 			fprintf(stderr, "To measure SMI cost, it needs "
@@ -2483,7 +1238,7 @@ static int add_default_attributes(void)
 		}
 
 		if (!force_metric_only)
-			metric_only = true;
+			stat_config.metric_only = true;
 		if (topdown_filter_events(topdown_attrs, &str,
 				arch_topdown_check_group(&warn)) < 0) {
 			pr_err("Out of memory\n");
@@ -2580,7 +1335,7 @@ static int __cmd_record(int argc, const char **argv)
 	if (output_name)
 		data->file.path = output_name;
 
-	if (run_count != 1 || forever) {
+	if (stat_config.run_count != 1 || forever) {
 		pr_err("Cannot use -r option with perf stat record.\n");
 		return -1;
 	}
@@ -2853,12 +1608,12 @@ int cmd_stat(int argc, const char **argv)
 	perf_stat__collect_metric_expr(evsel_list);
 	perf_stat__init_shadow_stats();
 
-	if (csv_sep) {
-		csv_output = true;
-		if (!strcmp(csv_sep, "\\t"))
-			csv_sep = "\t";
+	if (stat_config.csv_sep) {
+		stat_config.csv_output = true;
+		if (!strcmp(stat_config.csv_sep, "\\t"))
+			stat_config.csv_sep = "\t";
 	} else
-		csv_sep = DEFAULT_SEPARATOR;
+		stat_config.csv_sep = DEFAULT_SEPARATOR;
 
 	if (argc && !strncmp(argv[0], "rec", 3)) {
 		argc = __cmd_record(argc, argv);
@@ -2883,17 +1638,17 @@ int cmd_stat(int argc, const char **argv)
 		goto out;
 	}
 
-	if (metric_only && stat_config.aggr_mode == AGGR_THREAD) {
+	if (stat_config.metric_only && stat_config.aggr_mode == AGGR_THREAD) {
 		fprintf(stderr, "--metric-only is not supported with --per-thread\n");
 		goto out;
 	}
 
-	if (metric_only && run_count > 1) {
+	if (stat_config.metric_only && stat_config.run_count > 1) {
 		fprintf(stderr, "--metric-only is not supported with -r\n");
 		goto out;
 	}
 
-	if (walltime_run_table && run_count <= 1) {
+	if (stat_config.walltime_run_table && stat_config.run_count <= 1) {
 		fprintf(stderr, "--table is only supported with -r\n");
 		parse_options_usage(stat_usage, stat_options, "r", 1);
 		parse_options_usage(NULL, stat_options, "table", 0);
@@ -2931,7 +1686,7 @@ int cmd_stat(int argc, const char **argv)
 	/*
 	 * let the spreadsheet do the pretty-printing
 	 */
-	if (csv_output) {
+	if (stat_config.csv_output) {
 		/* User explicitly passed -B? */
 		if (big_num_opt == 1) {
 			fprintf(stderr, "-B option not supported with -x\n");
@@ -2939,9 +1694,9 @@ int cmd_stat(int argc, const char **argv)
 			parse_options_usage(NULL, stat_options, "x", 1);
 			goto out;
 		} else /* Nope, so disable big number formatting */
-			big_num = false;
+			stat_config.big_num = false;
 	} else if (big_num_opt == 0) /* User passed --no-big-num */
-		big_num = false;
+		stat_config.big_num = false;
 
 	setup_system_wide(argc);
 
@@ -2949,21 +1704,21 @@ int cmd_stat(int argc, const char **argv)
 	 * Display user/system times only for single
 	 * run and when there's specified tracee.
 	 */
-	if ((run_count == 1) && target__none(&target))
-		ru_display = true;
+	if ((stat_config.run_count == 1) && target__none(&target))
+		stat_config.ru_display = true;
 
-	if (run_count < 0) {
+	if (stat_config.run_count < 0) {
 		pr_err("Run count must be a positive number\n");
 		parse_options_usage(stat_usage, stat_options, "r", 1);
 		goto out;
-	} else if (run_count == 0) {
+	} else if (stat_config.run_count == 0) {
 		forever = true;
-		run_count = 1;
+		stat_config.run_count = 1;
 	}
 
-	if (walltime_run_table) {
-		walltime_run = zalloc(run_count * sizeof(walltime_run[0]));
-		if (!walltime_run) {
+	if (stat_config.walltime_run_table) {
+		stat_config.walltime_run = zalloc(stat_config.run_count * sizeof(stat_config.walltime_run[0]));
+		if (!stat_config.walltime_run) {
 			pr_err("failed to setup -r option");
 			goto out;
 		}
@@ -3065,6 +1820,17 @@ int cmd_stat(int argc, const char **argv)
 	if (perf_stat_init_aggr_mode())
 		goto out;
 
+	/*
+	 * Set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless
+	 * while avoiding that older tools show confusing messages.
+	 *
+	 * However for pipe sessions we need to keep it zero,
+	 * because script's perf_evsel__check_attr is triggered
+	 * by attr->sample_type != 0, and we can't run it on
+	 * stat sessions.
+	 */
+	stat_config.identifier = !(STAT_RECORD && perf_stat.data.is_pipe);
+
 	/*
 	 * We dont want to block the signals - that would cause
 	 * child tasks to inherit that and Ctrl-C would not work.
@@ -3079,8 +1845,8 @@ int cmd_stat(int argc, const char **argv)
 	signal(SIGABRT, skip_signal);
 
 	status = 0;
-	for (run_idx = 0; forever || run_idx < run_count; run_idx++) {
-		if (run_count != 1 && verbose > 0)
+	for (run_idx = 0; forever || run_idx < stat_config.run_count; run_idx++) {
+		if (stat_config.run_count != 1 && verbose > 0)
 			fprintf(output, "[ perf stat: executing run #%d ... ]\n",
 				run_idx + 1);
 
@@ -3132,7 +1898,7 @@ int cmd_stat(int argc, const char **argv)
 	perf_stat__exit_aggr_mode();
 	perf_evlist__free_stats(evsel_list);
 out:
-	free(walltime_run);
+	free(stat_config.walltime_run);
 
 	if (smi_cost && smi_reset)
 		sysfs__write_int(FREEZE_ON_SMI_PATH, 0);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 22ab8e67c7600..7ce277d22a915 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -288,6 +288,13 @@ static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
 	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
 }
 
+static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
+{
+	struct syscall_tp *sc = evsel->priv;
+
+	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
+}
+
 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
 {
 	evsel->priv = malloc(sizeof(struct syscall_tp));
@@ -498,16 +505,6 @@ static const char *clockid[] = {
 };
 static DEFINE_STRARRAY(clockid);
 
-static const char *socket_families[] = {
-	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
-	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
-	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
-	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
-	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
-	"ALG", "NFC", "VSOCK",
-};
-static DEFINE_STRARRAY(socket_families);
-
 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 						 struct syscall_arg *arg)
 {
@@ -631,6 +628,8 @@ static struct syscall_fmt {
 } syscall_fmts[] = {
 	{ .name	    = "access",
 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
+	{ .name	    = "bind",
+	  .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
 	{ .name	    = "bpf",
 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 	{ .name	    = "brk",	    .hexret = true,
@@ -645,6 +644,8 @@ static struct syscall_fmt {
 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
 	{ .name	    = "close",
 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
+	{ .name	    = "connect",
+	  .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, },
 	{ .name	    = "epoll_ctl",
 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
 	{ .name	    = "eventfd2",
@@ -801,7 +802,8 @@ static struct syscall_fmt {
 	{ .name	    = "sendmsg",
 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 	{ .name	    = "sendto",
-	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
+		   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
 	{ .name	    = "set_tid_address", .errpid = true, },
 	{ .name	    = "setitimer",
 	  .arg = { [0] = STRARRAY(which, itimers), }, },
@@ -830,6 +832,7 @@ static struct syscall_fmt {
 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 	{ .name	    = "tkill",
 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name     = "umount2", .alias = "umount", },
 	{ .name	    = "uname", .alias = "newuname", },
 	{ .name	    = "unlinkat",
 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
@@ -856,10 +859,12 @@ static struct syscall_fmt *syscall_fmt__find(const char *name)
 /*
  * is_exit: is this "exit" or "exit_group"?
  * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
+ * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
  */
 struct syscall {
 	struct event_format *tp_format;
 	int		    nr_args;
+	int		    args_size;
 	bool		    is_exit;
 	bool		    is_open;
 	struct format_field *args;
@@ -1095,11 +1100,21 @@ static void thread__set_filename_pos(struct thread *thread, const char *bf,
 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
 }
 
+static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
+{
+	struct augmented_arg *augmented_arg = arg->augmented.args;
+
+	return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value);
+}
+
 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 					      struct syscall_arg *arg)
 {
 	unsigned long ptr = arg->val;
 
+	if (arg->augmented.args)
+		return syscall_arg__scnprintf_augmented_string(arg, bf, size);
+
 	if (!arg->trace->vfs_getname)
 		return scnprintf(bf, size, "%#x", ptr);
 
@@ -1142,11 +1157,9 @@ static void sig_handler(int sig)
 	interrupted = sig == SIGINT;
 }
 
-static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
-					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
+static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
 {
-	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
-	printed += fprintf_duration(duration, duration_calculated, fp);
+	size_t printed = 0;
 
 	if (trace->multiple_threads) {
 		if (trace->show_comm)
@@ -1157,6 +1170,14 @@ static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thre
 	return printed;
 }
 
+static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
+					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
+{
+	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
+	printed += fprintf_duration(duration, duration_calculated, fp);
+	return printed + trace__fprintf_comm_tid(trace, thread, fp);
+}
+
 static int trace__process_event(struct trace *trace, struct machine *machine,
 				union perf_event *event, struct perf_sample *sample)
 {
@@ -1258,10 +1279,12 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
 
 static int syscall__set_arg_fmts(struct syscall *sc)
 {
-	struct format_field *field;
+	struct format_field *field, *last_field = NULL;
 	int idx = 0, len;
 
 	for (field = sc->args; field; field = field->next, ++idx) {
+		last_field = field;
+
 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
 			continue;
 
@@ -1292,6 +1315,9 @@ static int syscall__set_arg_fmts(struct syscall *sc)
 		}
 	}
 
+	if (last_field)
+		sc->args_size = last_field->offset + last_field->size;
+
 	return 0;
 }
 
@@ -1472,14 +1498,18 @@ static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
 }
 
 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
-				      unsigned char *args, struct trace *trace,
-				      struct thread *thread)
+				      unsigned char *args, void *augmented_args, int augmented_args_size,
+				      struct trace *trace, struct thread *thread)
 {
 	size_t printed = 0;
 	unsigned long val;
 	u8 bit = 1;
 	struct syscall_arg arg = {
 		.args	= args,
+		.augmented = {
+			.size = augmented_args_size,
+			.args = augmented_args,
+		},
 		.idx	= 0,
 		.mask	= 0,
 		.trace  = trace,
@@ -1654,6 +1684,17 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
 	return printed;
 }
 
+static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size)
+{
+	void *augmented_args = NULL;
+
+	*augmented_args_size = sample->raw_size - sc->args_size;
+	if (*augmented_args_size > 0)
+		augmented_args = sample->raw_data + sc->args_size;
+
+	return augmented_args;
+}
+
 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 			    union perf_event *event __maybe_unused,
 			    struct perf_sample *sample)
@@ -1663,6 +1704,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	size_t printed = 0;
 	struct thread *thread;
 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
+	int augmented_args_size = 0;
+	void *augmented_args = NULL;
 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
@@ -1686,13 +1729,24 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 
 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
 		trace__printf_interrupted_entry(trace);
-
+	/*
+	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
+	 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
+	 * this breaks syscall__augmented_args() check for augmented args, as we calculate
+	 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
+	 * so when handling, say the openat syscall, we end up getting 6 args for the
+	 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
+	 * thinking that the extra 2 u64 args are the augmented filename, so just check
+	 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
+	 */
+	if (evsel != trace->syscalls.events.sys_enter)
+		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size);
 	ttrace->entry_time = sample->time;
 	msg = ttrace->entry_str;
 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
 
 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
-					   args, trace, thread);
+					   args, augmented_args, augmented_args_size, trace, thread);
 
 	if (sc->is_exit) {
 		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
@@ -1723,7 +1777,8 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse
 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	char msg[1024];
-	void *args;
+	void *args, *augmented_args = NULL;
+	int augmented_args_size;
 
 	if (sc == NULL)
 		return -1;
@@ -1738,7 +1793,8 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse
 		goto out_put;
 
 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
-	syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
+	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size);
+	syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
 	fprintf(trace->output, "%s", msg);
 	err = 0;
 out_put:
@@ -2022,6 +2078,7 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 				union perf_event *event __maybe_unused,
 				struct perf_sample *sample)
 {
+	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 	int callchain_ret = 0;
 
 	if (sample->callchain) {
@@ -2039,13 +2096,31 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 	if (trace->trace_syscalls)
 		fprintf(trace->output, "(         ): ");
 
+	if (thread)
+		trace__fprintf_comm_tid(trace, thread, trace->output);
+
+	if (evsel == trace->syscalls.events.augmented) {
+		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+		struct syscall *sc = trace__syscall_info(trace, evsel, id);
+
+		if (sc) {
+			fprintf(trace->output, "%s(", sc->name);
+			trace__fprintf_sys_enter(trace, evsel, sample);
+			fputc(')', trace->output);
+			goto newline;
+		}
+
+		/*
+		 * XXX: Not having the associated syscall info or not finding/adding
+		 * 	the thread should never happen, but if it does...
+		 * 	fall thru and print it as a bpf_output event.
+		 */
+	}
+
 	fprintf(trace->output, "%s:", evsel->name);
 
 	if (perf_evsel__is_bpf_output(evsel)) {
-		if (evsel == trace->syscalls.events.augmented)
-			trace__fprintf_sys_enter(trace, evsel, sample);
-		else
-			bpf_output__fprintf(trace, sample);
+		bpf_output__fprintf(trace, sample);
 	} else if (evsel->tp_format) {
 		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
 		    trace__fprintf_sys_enter(trace, evsel, sample)) {
@@ -2055,12 +2130,14 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 		}
 	}
 
+newline:
 	fprintf(trace->output, "\n");
 
 	if (callchain_ret > 0)
 		trace__fprintf_callchain(trace, sample);
 	else if (callchain_ret < 0)
 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+	thread__put(thread);
 out:
 	return 0;
 }
@@ -3276,12 +3353,8 @@ int cmd_trace(int argc, const char **argv)
 		goto out;
 	}
 
-	if (evsel) {
-		if (perf_evsel__init_augmented_syscall_tp(evsel) ||
-		    perf_evsel__init_augmented_syscall_tp_args(evsel))
-			goto out;
+	if (evsel)
 		trace.syscalls.events.augmented = evsel;
-	}
 
 	err = bpf__setup_stdout(trace.evlist);
 	if (err) {
@@ -3326,6 +3399,34 @@ int cmd_trace(int argc, const char **argv)
 		}
 	}
 
+	/*
+	 * If we are augmenting syscalls, then combine what we put in the
+	 * __augmented_syscalls__ BPF map with what is in the
+	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
+	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
+	 *
+	 * We'll switch to look at two BPF maps, one for sys_enter and the
+	 * other for sys_exit when we start augmenting the sys_exit paths with
+	 * buffers that are being copied from kernel to userspace, think 'read'
+	 * syscall.
+	 */
+	if (trace.syscalls.events.augmented) {
+		evsel = trace.syscalls.events.augmented;
+
+		if (perf_evsel__init_augmented_syscall_tp(evsel) ||
+		    perf_evsel__init_augmented_syscall_tp_args(evsel))
+			goto out;
+		evsel->handler = trace__sys_enter;
+
+		evlist__for_each_entry(trace.evlist, evsel) {
+			if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
+				perf_evsel__init_augmented_syscall_tp(evsel);
+				perf_evsel__init_augmented_syscall_tp_ret(evsel);
+				evsel->handler = trace__sys_exit;
+			}
+		}
+	}
+
 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
 		return trace__record(&trace, argc-1, &argv[1]);
 
diff --git a/tools/perf/examples/bpf/augmented_syscalls.c b/tools/perf/examples/bpf/augmented_syscalls.c
index 69a31386d8cd9..2ae44813ef2d1 100644
--- a/tools/perf/examples/bpf/augmented_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_syscalls.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Augment the openat syscall with the contents of the filename pointer argument.
+ * Augment syscalls with the contents of the pointer arguments.
  *
  * Test it with:
  *
@@ -10,15 +10,14 @@
  * the last one should be the one for '/etc/passwd'.
  *
  * This matches what is marshalled into the raw_syscall:sys_enter payload
- * expected by the 'perf trace' beautifiers, and can be used by them unmodified,
- * which will be done as that feature is implemented in the next csets, for now
- * it will appear in a dump done by the default tracepoint handler in 'perf trace',
- * that uses bpf_output__fprintf() to just dump those contents, as done with
- * the bpf-output event associated with the __bpf_output__ map declared in
- * tools/perf/include/bpf/stdio.h.
+ * expected by the 'perf trace' beautifiers, and can be used by them, that will
+ * check if perf_sample->raw_data is more than what is expected for each
+ * syscalls:sys_{enter,exit}_SYSCALL tracepoint, uing the extra data as the
+ * contents of pointer arguments.
  */
 
 #include <stdio.h>
+#include <linux/socket.h>
 
 struct bpf_map SEC("maps") __augmented_syscalls__ = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
@@ -27,6 +26,44 @@ struct bpf_map SEC("maps") __augmented_syscalls__ = {
        .max_entries = __NR_CPUS__,
 };
 
+struct syscall_exit_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	long		   ret;
+};
+
+struct augmented_filename {
+	unsigned int	size;
+	int		reserved;
+	char		value[256];
+};
+
+#define augmented_filename_syscall(syscall)							\
+struct augmented_enter_##syscall##_args {			 				\
+	struct syscall_enter_##syscall##_args	args;				 		\
+	struct augmented_filename		filename;				 	\
+};												\
+int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args)				\
+{												\
+	struct augmented_enter_##syscall##_args augmented_args = { .filename.reserved = 0, }; 	\
+	unsigned int len = sizeof(augmented_args);						\
+	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);			\
+	augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, 		\
+						      sizeof(augmented_args.filename.value), 	\
+						      args->filename_ptr); 			\
+	if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {		\
+		len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;	\
+		len &= sizeof(augmented_args.filename.value) - 1;				\
+	}											\
+	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, 			\
+			  &augmented_args, len);						\
+	return 0;										\
+}												\
+int syscall_exit(syscall)(struct syscall_exit_args *args)					\
+{												\
+       return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */	\
+}
+
 struct syscall_enter_openat_args {
 	unsigned long long common_tp_fields;
 	long		   syscall_nr;
@@ -36,20 +73,101 @@ struct syscall_enter_openat_args {
 	long		   mode;
 };
 
-struct augmented_enter_openat_args {
-	struct syscall_enter_openat_args args;
-	char				 filename[64];
+augmented_filename_syscall(openat);
+
+struct syscall_enter_open_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	char		   *filename_ptr;
+	long		   flags;
+	long		   mode;
+};
+
+augmented_filename_syscall(open);
+
+struct syscall_enter_inotify_add_watch_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	long		   fd;
+	char		   *filename_ptr;
+	long		   mask;
+};
+
+augmented_filename_syscall(inotify_add_watch);
+
+struct statbuf;
+
+struct syscall_enter_newstat_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	char		   *filename_ptr;
+	struct stat	   *statbuf;
 };
 
-int syscall_enter(openat)(struct syscall_enter_openat_args *args)
-{
-	struct augmented_enter_openat_args augmented_args;
+augmented_filename_syscall(newstat);
+
+#ifndef _K_SS_MAXSIZE
+#define _K_SS_MAXSIZE 128
+#endif
 
-	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
-	probe_read_str(&augmented_args.filename, sizeof(augmented_args.filename), args->filename_ptr);
-	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU,
-			  &augmented_args, sizeof(augmented_args));
-	return 1;
+#define augmented_sockaddr_syscall(syscall)						\
+struct augmented_enter_##syscall##_args {			 				\
+	struct syscall_enter_##syscall##_args	args;				 		\
+	struct sockaddr_storage			addr;						\
+};												\
+int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args)				\
+{												\
+	struct augmented_enter_##syscall##_args augmented_args;				 	\
+	unsigned long addrlen = sizeof(augmented_args.addr);					\
+	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);			\
+/* FIXME_CLANG_OPTIMIZATION_THAT_ACCESSES_USER_CONTROLLED_ADDRLEN_DESPITE_THIS_CHECK */		\
+/*	if (addrlen > augmented_args.args.addrlen)				     */		\
+/*		addrlen = augmented_args.args.addrlen;				     */		\
+/*										     */		\
+	probe_read(&augmented_args.addr, addrlen, args->addr_ptr); 				\
+	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, 			\
+			  &augmented_args, 							\
+			  sizeof(augmented_args) - sizeof(augmented_args.addr) + addrlen);	\
+	return 0;										\
+}												\
+int syscall_exit(syscall)(struct syscall_exit_args *args)					\
+{												\
+       return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */	\
 }
 
+struct sockaddr;
+
+struct syscall_enter_bind_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	long		   fd;
+	struct sockaddr	   *addr_ptr;
+	unsigned long	   addrlen;
+};
+
+augmented_sockaddr_syscall(bind);
+
+struct syscall_enter_connect_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	long		   fd;
+	struct sockaddr	   *addr_ptr;
+	unsigned long	   addrlen;
+};
+
+augmented_sockaddr_syscall(connect);
+
+struct syscall_enter_sendto_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	long		   fd;
+	void		   *buff;
+	long		   len;
+	unsigned long	   flags;
+	struct sockaddr	   *addr_ptr;
+	long		   addr_len;
+};
+
+augmented_sockaddr_syscall(sendto);
+
 license(GPL);
diff --git a/tools/perf/examples/bpf/etcsnoop.c b/tools/perf/examples/bpf/etcsnoop.c
new file mode 100644
index 0000000000000..b59e8812ee8c6
--- /dev/null
+++ b/tools/perf/examples/bpf/etcsnoop.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Augment the filename syscalls with the contents of the filename pointer argument
+ * filtering only those that do not start with /etc/.
+ *
+ * Test it with:
+ *
+ * perf trace -e tools/perf/examples/bpf/augmented_syscalls.c cat /etc/passwd > /dev/null
+ *
+ * It'll catch some openat syscalls related to the dynamic linked and
+ * the last one should be the one for '/etc/passwd'.
+ *
+ * This matches what is marshalled into the raw_syscall:sys_enter payload
+ * expected by the 'perf trace' beautifiers, and can be used by them unmodified,
+ * which will be done as that feature is implemented in the next csets, for now
+ * it will appear in a dump done by the default tracepoint handler in 'perf trace',
+ * that uses bpf_output__fprintf() to just dump those contents, as done with
+ * the bpf-output event associated with the __bpf_output__ map declared in
+ * tools/perf/include/bpf/stdio.h.
+ */
+
+#include <stdio.h>
+
+struct bpf_map SEC("maps") __augmented_syscalls__ = {
+       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(u32),
+       .max_entries = __NR_CPUS__,
+};
+
+struct augmented_filename {
+	int	size;
+	int	reserved;
+	char	value[64];
+};
+
+#define augmented_filename_syscall_enter(syscall) 						\
+struct augmented_enter_##syscall##_args {			 				\
+	struct syscall_enter_##syscall##_args	args;				 		\
+	struct augmented_filename		filename;				 	\
+};												\
+int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args)				\
+{												\
+	char etc[6] = "/etc/";									\
+	struct augmented_enter_##syscall##_args augmented_args = { .filename.reserved = 0, }; 	\
+	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);			\
+	augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, 		\
+						      sizeof(augmented_args.filename.value), 	\
+						      args->filename_ptr); 			\
+	if (__builtin_memcmp(augmented_args.filename.value, etc, 4) != 0)			\
+		return 0;									\
+	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, 			\
+			  &augmented_args, 							\
+			  (sizeof(augmented_args) - sizeof(augmented_args.filename.value) +	\
+			   augmented_args.filename.size));					\
+	return 0;										\
+}
+
+struct syscall_enter_openat_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	long		   dfd;
+	char		   *filename_ptr;
+	long		   flags;
+	long		   mode;
+};
+
+augmented_filename_syscall_enter(openat);
+
+struct syscall_enter_open_args {
+	unsigned long long common_tp_fields;
+	long		   syscall_nr;
+	char		   *filename_ptr;
+	long		   flags;
+	long		   mode;
+};
+
+augmented_filename_syscall_enter(open);
+
+license(GPL);
diff --git a/tools/perf/include/bpf/bpf.h b/tools/perf/include/bpf/bpf.h
index 47897d65e799b..52b6d87fe822c 100644
--- a/tools/perf/include/bpf/bpf.h
+++ b/tools/perf/include/bpf/bpf.h
@@ -26,6 +26,9 @@ struct bpf_map {
 #define syscall_enter(name) \
 	SEC("syscalls:sys_enter_" #name) syscall_enter_ ## name
 
+#define syscall_exit(name) \
+	SEC("syscalls:sys_exit_" #name) syscall_exit_ ## name
+
 #define license(name) \
 char _license[] SEC("license") = #name; \
 int _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/tools/perf/include/bpf/linux/socket.h b/tools/perf/include/bpf/linux/socket.h
new file mode 100644
index 0000000000000..7f844568dab87
--- /dev/null
+++ b/tools/perf/include/bpf/linux/socket.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_SOCKET_H
+#define _UAPI_LINUX_SOCKET_H
+
+/*
+ * Desired design of maximum size and alignment (see RFC2553)
+ */
+#define _K_SS_MAXSIZE	128	/* Implementation specific max size */
+#define _K_SS_ALIGNSIZE	(__alignof__ (struct sockaddr *))
+				/* Implementation specific desired alignment */
+
+typedef unsigned short __kernel_sa_family_t;
+
+struct __kernel_sockaddr_storage {
+	__kernel_sa_family_t	ss_family;		/* address family */
+	/* Following field(s) are implementation specific */
+	char		__data[_K_SS_MAXSIZE - sizeof(unsigned short)];
+				/* space to achieve desired size, */
+				/* _SS_MAXSIZE value minus size of ss_family */
+} __attribute__ ((aligned(_K_SS_ALIGNSIZE)));	/* force desired alignment */
+
+#define sockaddr_storage __kernel_sockaddr_storage
+
+#endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
index 3013ac8f83d0a..cab7b0aea6eab 100755
--- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
+++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
@@ -48,7 +48,7 @@ trace_libc_inet_pton_backtrace() {
 	*)
 		eventattr='max-stack=3'
 		echo "getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\($libc\)$" >> $expected
-		echo ".*\+0x[[:xdigit:]]+[[:space:]]\(.*/bin/ping.*\)$" >> $expected
+		echo ".*(\+0x[[:xdigit:]]+|\[unknown\])[[:space:]]\(.*/bin/ping.*\)$" >> $expected
 		;;
 	esac
 
diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build
index f528ba35e1409..c3b0afd67760a 100644
--- a/tools/perf/trace/beauty/Build
+++ b/tools/perf/trace/beauty/Build
@@ -7,5 +7,6 @@ endif
 libperf-y += kcmp.o
 libperf-y += pkey_alloc.o
 libperf-y += prctl.o
+libperf-y += sockaddr.o
 libperf-y += socket.o
 libperf-y += statx.o
diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h
index 9615af5d412b1..2570152d39097 100644
--- a/tools/perf/trace/beauty/beauty.h
+++ b/tools/perf/trace/beauty/beauty.h
@@ -30,9 +30,36 @@ struct thread;
 
 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size);
 
+extern struct strarray strarray__socket_families;
+
+/**
+ * augmented_arg: extra payload for syscall pointer arguments
+ 
+ * If perf_sample->raw_size is more than what a syscall sys_enter_FOO puts,
+ * then its the arguments contents, so that we can show more than just a
+ * pointer. This will be done initially with eBPF, the start of that is at the
+ * tools/perf/examples/bpf/augmented_syscalls.c example for the openat, but
+ * will eventually be done automagically caching the running kernel tracefs
+ * events data into an eBPF C script, that then gets compiled and its .o file
+ * cached for subsequent use. For char pointers like the ones for 'open' like
+ * syscalls its easy, for the rest we should use DWARF or better, BTF, much
+ * more compact.
+ *
+ * @size: 8 if all we need is an integer, otherwise all of the augmented arg.
+ * @int_arg: will be used for integer like pointer contents, like 'accept's 'upeer_addrlen'
+ * @value: u64 aligned, for structs, pathnames
+ */
+struct augmented_arg {
+	int  size;
+	int  int_arg;
+	u64  value[];
+};
+
 /**
  * @val: value of syscall argument being formatted
  * @args: All the args, use syscall_args__val(arg, nth) to access one
+ * @augmented_args: Extra data that can be collected, for instance, with eBPF for expanding the pathname for open, etc
+ * @augmented_args_size: augmented_args total payload size
  * @thread: tid state (maps, pid, tid, etc)
  * @trace: 'perf trace' internals: all threads, etc
  * @parm: private area, may be an strarray, for instance
@@ -43,6 +70,10 @@ size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_
 struct syscall_arg {
 	unsigned long val;
 	unsigned char *args;
+	struct {
+		struct augmented_arg *args;
+		int		     size;
+	} augmented;
 	struct thread *thread;
 	struct trace  *trace;
 	void	      *parm;
@@ -106,6 +137,9 @@ size_t syscall_arg__scnprintf_prctl_arg2(char *bf, size_t size, struct syscall_a
 size_t syscall_arg__scnprintf_prctl_arg3(char *bf, size_t size, struct syscall_arg *arg);
 #define SCA_PRCTL_ARG3 syscall_arg__scnprintf_prctl_arg3
 
+size_t syscall_arg__scnprintf_sockaddr(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_SOCKADDR syscall_arg__scnprintf_sockaddr
+
 size_t syscall_arg__scnprintf_socket_protocol(char *bf, size_t size, struct syscall_arg *arg);
 #define SCA_SK_PROTO syscall_arg__scnprintf_socket_protocol
 
diff --git a/tools/perf/trace/beauty/sockaddr.c b/tools/perf/trace/beauty/sockaddr.c
new file mode 100644
index 0000000000000..71a79f72d9d92
--- /dev/null
+++ b/tools/perf/trace/beauty/sockaddr.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+
+#include "trace/beauty/beauty.h"
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <arpa/inet.h>
+
+static const char *socket_families[] = {
+	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
+	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
+	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
+	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
+	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
+	"ALG", "NFC", "VSOCK",
+};
+DEFINE_STRARRAY(socket_families);
+
+static size_t af_inet__scnprintf(struct sockaddr *sa, char *bf, size_t size)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+	char tmp[16];
+	return scnprintf(bf, size, ", port: %d, addr: %s", ntohs(sin->sin_port),
+			 inet_ntop(sin->sin_family, &sin->sin_addr, tmp, sizeof(tmp)));
+}
+
+static size_t af_inet6__scnprintf(struct sockaddr *sa, char *bf, size_t size)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+	u32 flowinfo = ntohl(sin6->sin6_flowinfo);
+	char tmp[512];
+	size_t printed = scnprintf(bf, size, ", port: %d, addr: %s", ntohs(sin6->sin6_port),
+				   inet_ntop(sin6->sin6_family, &sin6->sin6_addr, tmp, sizeof(tmp)));
+	if (flowinfo != 0)
+		printed += scnprintf(bf + printed, size - printed, ", flowinfo: %lu", flowinfo);
+	if (sin6->sin6_scope_id != 0)
+		printed += scnprintf(bf + printed, size - printed, ", scope_id: %lu", sin6->sin6_scope_id);
+
+	return printed;
+}
+
+static size_t af_local__scnprintf(struct sockaddr *sa, char *bf, size_t size)
+{
+	struct sockaddr_un *sun = (struct sockaddr_un *)sa;
+	return scnprintf(bf, size, ", path: %s", sun->sun_path);
+}
+
+static size_t (*af_scnprintfs[])(struct sockaddr *sa, char *bf, size_t size) = {
+	[AF_LOCAL] = af_local__scnprintf,
+	[AF_INET]  = af_inet__scnprintf,
+	[AF_INET6] = af_inet6__scnprintf,
+};
+
+static size_t syscall_arg__scnprintf_augmented_sockaddr(struct syscall_arg *arg, char *bf, size_t size)
+{
+	struct sockaddr *sa = (struct sockaddr *)arg->augmented.args;
+	char family[32];
+	size_t printed;
+
+	strarray__scnprintf(&strarray__socket_families, family, sizeof(family), "%d", sa->sa_family);
+	printed = scnprintf(bf, size, "{ .family: %s", family);
+
+	if (sa->sa_family < ARRAY_SIZE(af_scnprintfs) && af_scnprintfs[sa->sa_family])
+		printed += af_scnprintfs[sa->sa_family](sa, bf + printed, size - printed);
+
+	return printed + scnprintf(bf + printed, size - printed, " }");
+}
+
+size_t syscall_arg__scnprintf_sockaddr(char *bf, size_t size, struct syscall_arg *arg)
+{
+	if (arg->augmented.args)
+		return syscall_arg__scnprintf_augmented_sockaddr(arg, bf, size);
+
+	return scnprintf(bf, size, "%#x", arg->val);
+}
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 7efe15b9618d0..ecd9f9ceda77c 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -73,6 +73,7 @@ libperf-y += vdso.o
 libperf-y += counts.o
 libperf-y += stat.o
 libperf-y += stat-shadow.o
+libperf-y += stat-display.o
 libperf-y += record.o
 libperf-y += srcline.o
 libperf-y += data.o
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 20061cf422887..28cd6a17491b2 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -246,8 +246,14 @@ static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 
 indirect_call:
 	tok = strchr(endptr, '*');
-	if (tok != NULL)
-		ops->target.addr = strtoull(tok + 1, NULL, 16);
+	if (tok != NULL) {
+		endptr++;
+
+		/* Indirect call can use a non-rip register and offset: callq  *0x8(%rbx).
+		 * Do not parse such instruction.  */
+		if (strstr(endptr, "(%r") == NULL)
+			ops->target.addr = strtoull(endptr, NULL, 16);
+	}
 	goto find_target;
 }
 
@@ -276,7 +282,19 @@ bool ins__is_call(const struct ins *ins)
 	return ins->ops == &call_ops || ins->ops == &s390_call_ops;
 }
 
-static int jump__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms)
+/*
+ * Prevents from matching commas in the comment section, e.g.:
+ * ffff200008446e70:       b.cs    ffff2000084470f4 <generic_exec_single+0x314>  // b.hs, b.nlast
+ */
+static inline const char *validate_comma(const char *c, struct ins_operands *ops)
+{
+	if (ops->raw_comment && c > ops->raw_comment)
+		return NULL;
+
+	return c;
+}
+
+static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms)
 {
 	struct map *map = ms->map;
 	struct symbol *sym = ms->sym;
@@ -285,6 +303,10 @@ static int jump__parse(struct arch *arch __maybe_unused, struct ins_operands *op
 	};
 	const char *c = strchr(ops->raw, ',');
 	u64 start, end;
+
+	ops->raw_comment = strchr(ops->raw, arch->objdump.comment_char);
+	c = validate_comma(c, ops);
+
 	/*
 	 * Examples of lines to parse for the _cpp_lex_token@@Base
 	 * function:
@@ -304,6 +326,7 @@ static int jump__parse(struct arch *arch __maybe_unused, struct ins_operands *op
 		ops->target.addr = strtoull(c, NULL, 16);
 		if (!ops->target.addr) {
 			c = strchr(c, ',');
+			c = validate_comma(c, ops);
 			if (c++ != NULL)
 				ops->target.addr = strtoull(c, NULL, 16);
 		}
@@ -361,9 +384,12 @@ static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
 		return scnprintf(bf, size, "%-6s %s", ins->name, ops->target.sym->name);
 
 	c = strchr(ops->raw, ',');
+	c = validate_comma(c, ops);
+
 	if (c != NULL) {
 		const char *c2 = strchr(c + 1, ',');
 
+		c2 = validate_comma(c2, ops);
 		/* check for 3-op insn */
 		if (c2 != NULL)
 			c = c2;
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 005a5fe8a8c6b..5399ba2321bbb 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -22,6 +22,7 @@ struct ins {
 
 struct ins_operands {
 	char	*raw;
+	char	*raw_comment;
 	struct {
 		char	*raw;
 		char	*name;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index c980bbff63536..4ec909d57e9c9 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -251,8 +251,9 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
 {
 	struct perf_evsel *evsel = zalloc(perf_evsel__object.size);
 
-	if (evsel != NULL)
-		perf_evsel__init(evsel, attr, idx);
+	if (!evsel)
+		return NULL;
+	perf_evsel__init(evsel, attr, idx);
 
 	if (perf_evsel__is_bpf_output(evsel)) {
 		evsel->attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
@@ -2939,3 +2940,32 @@ struct perf_env *perf_evsel__env(struct perf_evsel *evsel)
 		return evsel->evlist->env;
 	return NULL;
 }
+
+static int store_evsel_ids(struct perf_evsel *evsel, struct perf_evlist *evlist)
+{
+	int cpu, thread;
+
+	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+		for (thread = 0; thread < xyarray__max_y(evsel->fd);
+		     thread++) {
+			int fd = FD(evsel, cpu, thread);
+
+			if (perf_evlist__id_add_fd(evlist, evsel,
+						   cpu, thread, fd) < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+int perf_evsel__store_ids(struct perf_evsel *evsel, struct perf_evlist *evlist)
+{
+	struct cpu_map *cpus = evsel->cpus;
+	struct thread_map *threads = evsel->threads;
+
+	if (perf_evsel__alloc_id(evsel, cpus->nr, threads->nr))
+		return -ENOMEM;
+
+	return store_evsel_ids(evsel, evlist);
+}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 163c960614d33..4f8430a85531c 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -481,4 +481,5 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 
 struct perf_env *perf_evsel__env(struct perf_evsel *evsel);
 
+int perf_evsel__store_ids(struct perf_evsel *evsel, struct perf_evlist *evlist);
 #endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 3cadc252dd897..91e6d9cfd9063 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3637,13 +3637,13 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp)
 }
 
 int perf_event__synthesize_attrs(struct perf_tool *tool,
-				   struct perf_session *session,
-				   perf_event__handler_t process)
+				 struct perf_evlist *evlist,
+				 perf_event__handler_t process)
 {
 	struct perf_evsel *evsel;
 	int err = 0;
 
-	evlist__for_each_entry(session->evlist, evsel) {
+	evlist__for_each_entry(evlist, evsel) {
 		err = perf_event__synthesize_attr(tool, &evsel->attr, evsel->ids,
 						  evsel->id, process);
 		if (err) {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 6d7fe44aadc0d..ff2a1263fb9ba 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -124,7 +124,7 @@ int perf_event__synthesize_attr(struct perf_tool *tool,
 				struct perf_event_attr *attr, u32 ids, u64 *id,
 				perf_event__handler_t process);
 int perf_event__synthesize_attrs(struct perf_tool *tool,
-				 struct perf_session *session,
+				 struct perf_evlist *evlist,
 				 perf_event__handler_t process);
 int perf_event__synthesize_event_update_unit(struct perf_tool *tool,
 					     struct perf_evsel *evsel,
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index 19262f98cd4e1..5b0b60f00275e 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -19,7 +19,7 @@
 #define CLANG_BPF_CMD_DEFAULT_TEMPLATE				\
 		"$CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS "\
 		"-DLINUX_VERSION_CODE=$LINUX_VERSION_CODE "	\
-		"$CLANG_OPTIONS $KERNEL_INC_OPTIONS $PERF_BPF_INC_OPTIONS " \
+		"$CLANG_OPTIONS $PERF_BPF_INC_OPTIONS $KERNEL_INC_OPTIONS " \
 		"-Wno-unused-value -Wno-pointer-sign "		\
 		"-working-directory $WORKING_DIR "		\
 		"-c \"$CLANG_SOURCE\" -target bpf $CLANG_EMIT_LLVM -O2 -o - $LLVM_OPTIONS_PIPE"
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 36d0763311efc..3f07a587c8e6a 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -320,12 +320,11 @@ int map__load(struct map *map)
 			build_id__sprintf(map->dso->build_id,
 					  sizeof(map->dso->build_id),
 					  sbuild_id);
-			pr_warning("%s with build id %s not found",
-				   name, sbuild_id);
+			pr_debug("%s with build id %s not found", name, sbuild_id);
 		} else
-			pr_warning("Failed to open %s", name);
+			pr_debug("Failed to open %s", name);
 
-		pr_warning(", continuing without symbols\n");
+		pr_debug(", continuing without symbols\n");
 		return -1;
 	} else if (nr == 0) {
 #ifdef HAVE_LIBELF_SUPPORT
@@ -334,12 +333,11 @@ int map__load(struct map *map)
 
 		if (len > sizeof(DSO__DELETED) &&
 		    strcmp(name + real_len + 1, DSO__DELETED) == 0) {
-			pr_warning("%.*s was updated (is prelink enabled?). "
+			pr_debug("%.*s was updated (is prelink enabled?). "
 				"Restart the long running apps that use it!\n",
 				   (int)real_len, name);
 		} else {
-			pr_warning("no symbols found in %s, maybe install "
-				   "a debug package?\n", name);
+			pr_debug("no symbols found in %s, maybe install a debug package?\n", name);
 		}
 #endif
 		return -1;
@@ -701,8 +699,7 @@ static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp
 		if (verbose >= 2) {
 
 			if (use_browser) {
-				pr_warning("overlapping maps in %s "
-					   "(disable tui for more info)\n",
+				pr_debug("overlapping maps in %s (disable tui for more info)\n",
 					   map->dso->name);
 			} else {
 				fputs("overlapping maps:\n", fp);
diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c
index d2c78ffd9feea..a2eeebbfb25f5 100644
--- a/tools/perf/util/s390-cpumsf.c
+++ b/tools/perf/util/s390-cpumsf.c
@@ -147,6 +147,9 @@
 #include <linux/bitops.h>
 #include <linux/log2.h>
 
+#include <sys/stat.h>
+#include <sys/types.h>
+
 #include "cpumap.h"
 #include "color.h"
 #include "evsel.h"
@@ -159,6 +162,7 @@
 #include "auxtrace.h"
 #include "s390-cpumsf.h"
 #include "s390-cpumsf-kernel.h"
+#include "config.h"
 
 struct s390_cpumsf {
 	struct auxtrace		auxtrace;
@@ -170,6 +174,8 @@ struct s390_cpumsf {
 	u32			pmu_type;
 	u16			machine_type;
 	bool			data_queued;
+	bool			use_logfile;
+	char			*logdir;
 };
 
 struct s390_cpumsf_queue {
@@ -177,6 +183,7 @@ struct s390_cpumsf_queue {
 	unsigned int		queue_nr;
 	struct auxtrace_buffer	*buffer;
 	int			cpu;
+	FILE			*logfile;
 };
 
 /* Display s390 CPU measurement facility basic-sampling data entry */
@@ -595,6 +602,12 @@ static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
 			buffer->use_size = buffer->size;
 			buffer->use_data = buffer->data;
 		}
+		if (sfq->logfile) {	/* Write into log file */
+			size_t rc = fwrite(buffer->data, buffer->size, 1,
+					   sfq->logfile);
+			if (rc != 1)
+				pr_err("Failed to write auxiliary data\n");
+		}
 	} else
 		buffer = sfq->buffer;
 
@@ -606,6 +619,13 @@ static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
 			return -ENOMEM;
 		buffer->use_size = buffer->size;
 		buffer->use_data = buffer->data;
+
+		if (sfq->logfile) {	/* Write into log file */
+			size_t rc = fwrite(buffer->data, buffer->size, 1,
+					   sfq->logfile);
+			if (rc != 1)
+				pr_err("Failed to write auxiliary data\n");
+		}
 	}
 	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
 		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
@@ -640,6 +660,23 @@ s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
 	sfq->sf = sf;
 	sfq->queue_nr = queue_nr;
 	sfq->cpu = -1;
+	if (sf->use_logfile) {
+		char *name;
+		int rc;
+
+		rc = (sf->logdir)
+			? asprintf(&name, "%s/aux.smp.%02x",
+				 sf->logdir, queue_nr)
+			: asprintf(&name, "aux.smp.%02x", queue_nr);
+		if (rc > 0)
+			sfq->logfile = fopen(name, "w");
+		if (sfq->logfile == NULL) {
+			pr_err("Failed to open auxiliary log file %s,"
+			       "continue...\n", name);
+			sf->use_logfile = false;
+		}
+		free(name);
+	}
 	return sfq;
 }
 
@@ -850,8 +887,16 @@ static void s390_cpumsf_free_queues(struct perf_session *session)
 	struct auxtrace_queues *queues = &sf->queues;
 	unsigned int i;
 
-	for (i = 0; i < queues->nr_queues; i++)
+	for (i = 0; i < queues->nr_queues; i++) {
+		struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
+						queues->queue_array[i].priv;
+
+		if (sfq != NULL && sfq->logfile) {
+			fclose(sfq->logfile);
+			sfq->logfile = NULL;
+		}
 		zfree(&queues->queue_array[i].priv);
+	}
 	auxtrace_queues__free(queues);
 }
 
@@ -864,6 +909,7 @@ static void s390_cpumsf_free(struct perf_session *session)
 	auxtrace_heap__free(&sf->heap);
 	s390_cpumsf_free_queues(session);
 	session->auxtrace = NULL;
+	free(sf->logdir);
 	free(sf);
 }
 
@@ -877,17 +923,55 @@ static int s390_cpumsf_get_type(const char *cpuid)
 
 /* Check itrace options set on perf report command.
  * Return true, if none are set or all options specified can be
- * handled on s390.
+ * handled on s390 (currently only option 'd' for logging.
  * Return false otherwise.
  */
 static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
 {
+	bool ison = false;
+
 	if (!itops || !itops->set)
 		return true;
-	pr_err("No --itrace options supported\n");
+	ison = itops->inject || itops->instructions || itops->branches ||
+		itops->transactions || itops->ptwrites ||
+		itops->pwr_events || itops->errors ||
+		itops->dont_decode || itops->calls || itops->returns ||
+		itops->callchain || itops->thread_stack ||
+		itops->last_branch;
+	if (!ison)
+		return true;
+	pr_err("Unsupported --itrace options specified\n");
 	return false;
 }
 
+/* Check for AUXTRACE dump directory if it is needed.
+ * On failure print an error message but continue.
+ * Return 0 on wrong keyword in config file and 1 otherwise.
+ */
+static int s390_cpumsf__config(const char *var, const char *value, void *cb)
+{
+	struct s390_cpumsf *sf = cb;
+	struct stat stbuf;
+	int rc;
+
+	if (strcmp(var, "auxtrace.dumpdir"))
+		return 0;
+	sf->logdir = strdup(value);
+	if (sf->logdir == NULL) {
+		pr_err("Failed to find auxtrace log directory %s,"
+		       " continue with current directory...\n", value);
+		return 1;
+	}
+	rc = stat(sf->logdir, &stbuf);
+	if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
+		pr_err("Missing auxtrace log directory %s,"
+		       " continue with current directory...\n", value);
+		free(sf->logdir);
+		sf->logdir = NULL;
+	}
+	return 1;
+}
+
 int s390_cpumsf_process_auxtrace_info(union perf_event *event,
 				      struct perf_session *session)
 {
@@ -906,6 +990,9 @@ int s390_cpumsf_process_auxtrace_info(union perf_event *event,
 		err = -EINVAL;
 		goto err_free;
 	}
+	sf->use_logfile = session->itrace_synth_opts->log;
+	if (sf->use_logfile)
+		perf_config(s390_cpumsf__config, sf);
 
 	err = auxtrace_queues__init(&sf->queues);
 	if (err)
@@ -940,6 +1027,7 @@ int s390_cpumsf_process_auxtrace_info(union perf_event *event,
 	auxtrace_queues__free(&sf->queues);
 	session->auxtrace = NULL;
 err_free:
+	free(sf->logdir);
 	free(sf);
 	return err;
 }
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
new file mode 100644
index 0000000000000..e7b4c44ebb625
--- /dev/null
+++ b/tools/perf/util/stat-display.c
@@ -0,0 +1,1166 @@
+#include <stdio.h>
+#include <inttypes.h>
+#include <linux/time64.h>
+#include <math.h>
+#include "evlist.h"
+#include "evsel.h"
+#include "stat.h"
+#include "top.h"
+#include "thread_map.h"
+#include "cpumap.h"
+#include "string2.h"
+#include "sane_ctype.h"
+#include "cgroup.h"
+#include <math.h>
+#include <api/fs/fs.h>
+
+#define CNTR_NOT_SUPPORTED	"<not supported>"
+#define CNTR_NOT_COUNTED	"<not counted>"
+
+static bool is_duration_time(struct perf_evsel *evsel)
+{
+	return !strcmp(evsel->name, "duration_time");
+}
+
+static void print_running(struct perf_stat_config *config,
+			  u64 run, u64 ena)
+{
+	if (config->csv_output) {
+		fprintf(config->output, "%s%" PRIu64 "%s%.2f",
+					config->csv_sep,
+					run,
+					config->csv_sep,
+					ena ? 100.0 * run / ena : 100.0);
+	} else if (run != ena) {
+		fprintf(config->output, "  (%.2f%%)", 100.0 * run / ena);
+	}
+}
+
+static void print_noise_pct(struct perf_stat_config *config,
+			    double total, double avg)
+{
+	double pct = rel_stddev_stats(total, avg);
+
+	if (config->csv_output)
+		fprintf(config->output, "%s%.2f%%", config->csv_sep, pct);
+	else if (pct)
+		fprintf(config->output, "  ( +-%6.2f%% )", pct);
+}
+
+static void print_noise(struct perf_stat_config *config,
+			struct perf_evsel *evsel, double avg)
+{
+	struct perf_stat_evsel *ps;
+
+	if (config->run_count == 1)
+		return;
+
+	ps = evsel->stats;
+	print_noise_pct(config, stddev_stats(&ps->res_stats[0]), avg);
+}
+
+static void aggr_printout(struct perf_stat_config *config,
+			  struct perf_evsel *evsel, int id, int nr)
+{
+	switch (config->aggr_mode) {
+	case AGGR_CORE:
+		fprintf(config->output, "S%d-C%*d%s%*d%s",
+			cpu_map__id_to_socket(id),
+			config->csv_output ? 0 : -8,
+			cpu_map__id_to_cpu(id),
+			config->csv_sep,
+			config->csv_output ? 0 : 4,
+			nr,
+			config->csv_sep);
+		break;
+	case AGGR_SOCKET:
+		fprintf(config->output, "S%*d%s%*d%s",
+			config->csv_output ? 0 : -5,
+			id,
+			config->csv_sep,
+			config->csv_output ? 0 : 4,
+			nr,
+			config->csv_sep);
+			break;
+	case AGGR_NONE:
+		fprintf(config->output, "CPU%*d%s",
+			config->csv_output ? 0 : -4,
+			perf_evsel__cpus(evsel)->map[id], config->csv_sep);
+		break;
+	case AGGR_THREAD:
+		fprintf(config->output, "%*s-%*d%s",
+			config->csv_output ? 0 : 16,
+			thread_map__comm(evsel->threads, id),
+			config->csv_output ? 0 : -8,
+			thread_map__pid(evsel->threads, id),
+			config->csv_sep);
+		break;
+	case AGGR_GLOBAL:
+	case AGGR_UNSET:
+	default:
+		break;
+	}
+}
+
+struct outstate {
+	FILE *fh;
+	bool newline;
+	const char *prefix;
+	int  nfields;
+	int  id, nr;
+	struct perf_evsel *evsel;
+};
+
+#define METRIC_LEN  35
+
+static void new_line_std(struct perf_stat_config *config __maybe_unused,
+			 void *ctx)
+{
+	struct outstate *os = ctx;
+
+	os->newline = true;
+}
+
+static void do_new_line_std(struct perf_stat_config *config,
+			    struct outstate *os)
+{
+	fputc('\n', os->fh);
+	fputs(os->prefix, os->fh);
+	aggr_printout(config, os->evsel, os->id, os->nr);
+	if (config->aggr_mode == AGGR_NONE)
+		fprintf(os->fh, "        ");
+	fprintf(os->fh, "                                                 ");
+}
+
+static void print_metric_std(struct perf_stat_config *config,
+			     void *ctx, const char *color, const char *fmt,
+			     const char *unit, double val)
+{
+	struct outstate *os = ctx;
+	FILE *out = os->fh;
+	int n;
+	bool newline = os->newline;
+
+	os->newline = false;
+
+	if (unit == NULL || fmt == NULL) {
+		fprintf(out, "%-*s", METRIC_LEN, "");
+		return;
+	}
+
+	if (newline)
+		do_new_line_std(config, os);
+
+	n = fprintf(out, " # ");
+	if (color)
+		n += color_fprintf(out, color, fmt, val);
+	else
+		n += fprintf(out, fmt, val);
+	fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
+}
+
+static void new_line_csv(struct perf_stat_config *config, void *ctx)
+{
+	struct outstate *os = ctx;
+	int i;
+
+	fputc('\n', os->fh);
+	if (os->prefix)
+		fprintf(os->fh, "%s%s", os->prefix, config->csv_sep);
+	aggr_printout(config, os->evsel, os->id, os->nr);
+	for (i = 0; i < os->nfields; i++)
+		fputs(config->csv_sep, os->fh);
+}
+
+static void print_metric_csv(struct perf_stat_config *config __maybe_unused,
+			     void *ctx,
+			     const char *color __maybe_unused,
+			     const char *fmt, const char *unit, double val)
+{
+	struct outstate *os = ctx;
+	FILE *out = os->fh;
+	char buf[64], *vals, *ends;
+
+	if (unit == NULL || fmt == NULL) {
+		fprintf(out, "%s%s", config->csv_sep, config->csv_sep);
+		return;
+	}
+	snprintf(buf, sizeof(buf), fmt, val);
+	ends = vals = ltrim(buf);
+	while (isdigit(*ends) || *ends == '.')
+		ends++;
+	*ends = 0;
+	while (isspace(*unit))
+		unit++;
+	fprintf(out, "%s%s%s%s", config->csv_sep, vals, config->csv_sep, unit);
+}
+
+/* Filter out some columns that don't work well in metrics only mode */
+
+static bool valid_only_metric(const char *unit)
+{
+	if (!unit)
+		return false;
+	if (strstr(unit, "/sec") ||
+	    strstr(unit, "hz") ||
+	    strstr(unit, "Hz") ||
+	    strstr(unit, "CPUs utilized"))
+		return false;
+	return true;
+}
+
+static const char *fixunit(char *buf, struct perf_evsel *evsel,
+			   const char *unit)
+{
+	if (!strncmp(unit, "of all", 6)) {
+		snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel),
+			 unit);
+		return buf;
+	}
+	return unit;
+}
+
+static void print_metric_only(struct perf_stat_config *config,
+			      void *ctx, const char *color, const char *fmt,
+			      const char *unit, double val)
+{
+	struct outstate *os = ctx;
+	FILE *out = os->fh;
+	char buf[1024], str[1024];
+	unsigned mlen = config->metric_only_len;
+
+	if (!valid_only_metric(unit))
+		return;
+	unit = fixunit(buf, os->evsel, unit);
+	if (mlen < strlen(unit))
+		mlen = strlen(unit) + 1;
+
+	if (color)
+		mlen += strlen(color) + sizeof(PERF_COLOR_RESET) - 1;
+
+	color_snprintf(str, sizeof(str), color ?: "", fmt, val);
+	fprintf(out, "%*s ", mlen, str);
+}
+
+static void print_metric_only_csv(struct perf_stat_config *config __maybe_unused,
+				  void *ctx, const char *color __maybe_unused,
+				  const char *fmt,
+				  const char *unit, double val)
+{
+	struct outstate *os = ctx;
+	FILE *out = os->fh;
+	char buf[64], *vals, *ends;
+	char tbuf[1024];
+
+	if (!valid_only_metric(unit))
+		return;
+	unit = fixunit(tbuf, os->evsel, unit);
+	snprintf(buf, sizeof buf, fmt, val);
+	ends = vals = ltrim(buf);
+	while (isdigit(*ends) || *ends == '.')
+		ends++;
+	*ends = 0;
+	fprintf(out, "%s%s", vals, config->csv_sep);
+}
+
+static void new_line_metric(struct perf_stat_config *config __maybe_unused,
+			    void *ctx __maybe_unused)
+{
+}
+
+static void print_metric_header(struct perf_stat_config *config,
+				void *ctx, const char *color __maybe_unused,
+				const char *fmt __maybe_unused,
+				const char *unit, double val __maybe_unused)
+{
+	struct outstate *os = ctx;
+	char tbuf[1024];
+
+	if (!valid_only_metric(unit))
+		return;
+	unit = fixunit(tbuf, os->evsel, unit);
+	if (config->csv_output)
+		fprintf(os->fh, "%s%s", unit, config->csv_sep);
+	else
+		fprintf(os->fh, "%*s ", config->metric_only_len, unit);
+}
+
+static int first_shadow_cpu(struct perf_stat_config *config,
+			    struct perf_evsel *evsel, int id)
+{
+	struct perf_evlist *evlist = evsel->evlist;
+	int i;
+
+	if (!config->aggr_get_id)
+		return 0;
+
+	if (config->aggr_mode == AGGR_NONE)
+		return id;
+
+	if (config->aggr_mode == AGGR_GLOBAL)
+		return 0;
+
+	for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
+		int cpu2 = perf_evsel__cpus(evsel)->map[i];
+
+		if (config->aggr_get_id(config, evlist->cpus, cpu2) == id)
+			return cpu2;
+	}
+	return 0;
+}
+
+static void abs_printout(struct perf_stat_config *config,
+			 int id, int nr, struct perf_evsel *evsel, double avg)
+{
+	FILE *output = config->output;
+	double sc =  evsel->scale;
+	const char *fmt;
+
+	if (config->csv_output) {
+		fmt = floor(sc) != sc ?  "%.2f%s" : "%.0f%s";
+	} else {
+		if (config->big_num)
+			fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s";
+		else
+			fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s";
+	}
+
+	aggr_printout(config, evsel, id, nr);
+
+	fprintf(output, fmt, avg, config->csv_sep);
+
+	if (evsel->unit)
+		fprintf(output, "%-*s%s",
+			config->csv_output ? 0 : config->unit_width,
+			evsel->unit, config->csv_sep);
+
+	fprintf(output, "%-*s", config->csv_output ? 0 : 25, perf_evsel__name(evsel));
+
+	if (evsel->cgrp)
+		fprintf(output, "%s%s", config->csv_sep, evsel->cgrp->name);
+}
+
+static bool is_mixed_hw_group(struct perf_evsel *counter)
+{
+	struct perf_evlist *evlist = counter->evlist;
+	u32 pmu_type = counter->attr.type;
+	struct perf_evsel *pos;
+
+	if (counter->nr_members < 2)
+		return false;
+
+	evlist__for_each_entry(evlist, pos) {
+		/* software events can be part of any hardware group */
+		if (pos->attr.type == PERF_TYPE_SOFTWARE)
+			continue;
+		if (pmu_type == PERF_TYPE_SOFTWARE) {
+			pmu_type = pos->attr.type;
+			continue;
+		}
+		if (pmu_type != pos->attr.type)
+			return true;
+	}
+
+	return false;
+}
+
+static void printout(struct perf_stat_config *config, int id, int nr,
+		     struct perf_evsel *counter, double uval,
+		     char *prefix, u64 run, u64 ena, double noise,
+		     struct runtime_stat *st)
+{
+	struct perf_stat_output_ctx out;
+	struct outstate os = {
+		.fh = config->output,
+		.prefix = prefix ? prefix : "",
+		.id = id,
+		.nr = nr,
+		.evsel = counter,
+	};
+	print_metric_t pm = print_metric_std;
+	new_line_t nl;
+
+	if (config->metric_only) {
+		nl = new_line_metric;
+		if (config->csv_output)
+			pm = print_metric_only_csv;
+		else
+			pm = print_metric_only;
+	} else
+		nl = new_line_std;
+
+	if (config->csv_output && !config->metric_only) {
+		static int aggr_fields[] = {
+			[AGGR_GLOBAL] = 0,
+			[AGGR_THREAD] = 1,
+			[AGGR_NONE] = 1,
+			[AGGR_SOCKET] = 2,
+			[AGGR_CORE] = 2,
+		};
+
+		pm = print_metric_csv;
+		nl = new_line_csv;
+		os.nfields = 3;
+		os.nfields += aggr_fields[config->aggr_mode];
+		if (counter->cgrp)
+			os.nfields++;
+	}
+	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
+		if (config->metric_only) {
+			pm(config, &os, NULL, "", "", 0);
+			return;
+		}
+		aggr_printout(config, counter, id, nr);
+
+		fprintf(config->output, "%*s%s",
+			config->csv_output ? 0 : 18,
+			counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+			config->csv_sep);
+
+		if (counter->supported) {
+			config->print_free_counters_hint = 1;
+			if (is_mixed_hw_group(counter))
+				config->print_mixed_hw_group_error = 1;
+		}
+
+		fprintf(config->output, "%-*s%s",
+			config->csv_output ? 0 : config->unit_width,
+			counter->unit, config->csv_sep);
+
+		fprintf(config->output, "%*s",
+			config->csv_output ? 0 : -25,
+			perf_evsel__name(counter));
+
+		if (counter->cgrp)
+			fprintf(config->output, "%s%s",
+				config->csv_sep, counter->cgrp->name);
+
+		if (!config->csv_output)
+			pm(config, &os, NULL, NULL, "", 0);
+		print_noise(config, counter, noise);
+		print_running(config, run, ena);
+		if (config->csv_output)
+			pm(config, &os, NULL, NULL, "", 0);
+		return;
+	}
+
+	if (!config->metric_only)
+		abs_printout(config, id, nr, counter, uval);
+
+	out.print_metric = pm;
+	out.new_line = nl;
+	out.ctx = &os;
+	out.force_header = false;
+
+	if (config->csv_output && !config->metric_only) {
+		print_noise(config, counter, noise);
+		print_running(config, run, ena);
+	}
+
+	perf_stat__print_shadow_stats(config, counter, uval,
+				first_shadow_cpu(config, counter, id),
+				&out, &config->metric_events, st);
+	if (!config->csv_output && !config->metric_only) {
+		print_noise(config, counter, noise);
+		print_running(config, run, ena);
+	}
+}
+
+static void aggr_update_shadow(struct perf_stat_config *config,
+			       struct perf_evlist *evlist)
+{
+	int cpu, s2, id, s;
+	u64 val;
+	struct perf_evsel *counter;
+
+	for (s = 0; s < config->aggr_map->nr; s++) {
+		id = config->aggr_map->map[s];
+		evlist__for_each_entry(evlist, counter) {
+			val = 0;
+			for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+				s2 = config->aggr_get_id(config, evlist->cpus, cpu);
+				if (s2 != id)
+					continue;
+				val += perf_counts(counter->counts, cpu, 0)->val;
+			}
+			perf_stat__update_shadow_stats(counter, val,
+					first_shadow_cpu(config, counter, id),
+					&rt_stat);
+		}
+	}
+}
+
+static void uniquify_event_name(struct perf_evsel *counter)
+{
+	char *new_name;
+	char *config;
+
+	if (counter->uniquified_name ||
+	    !counter->pmu_name || !strncmp(counter->name, counter->pmu_name,
+					   strlen(counter->pmu_name)))
+		return;
+
+	config = strchr(counter->name, '/');
+	if (config) {
+		if (asprintf(&new_name,
+			     "%s%s", counter->pmu_name, config) > 0) {
+			free(counter->name);
+			counter->name = new_name;
+		}
+	} else {
+		if (asprintf(&new_name,
+			     "%s [%s]", counter->name, counter->pmu_name) > 0) {
+			free(counter->name);
+			counter->name = new_name;
+		}
+	}
+
+	counter->uniquified_name = true;
+}
+
+static void collect_all_aliases(struct perf_stat_config *config, struct perf_evsel *counter,
+			    void (*cb)(struct perf_stat_config *config, struct perf_evsel *counter, void *data,
+				       bool first),
+			    void *data)
+{
+	struct perf_evlist *evlist = counter->evlist;
+	struct perf_evsel *alias;
+
+	alias = list_prepare_entry(counter, &(evlist->entries), node);
+	list_for_each_entry_continue (alias, &evlist->entries, node) {
+		if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) ||
+		    alias->scale != counter->scale ||
+		    alias->cgrp != counter->cgrp ||
+		    strcmp(alias->unit, counter->unit) ||
+		    perf_evsel__is_clock(alias) != perf_evsel__is_clock(counter))
+			break;
+		alias->merged_stat = true;
+		cb(config, alias, data, false);
+	}
+}
+
+static bool collect_data(struct perf_stat_config *config, struct perf_evsel *counter,
+			    void (*cb)(struct perf_stat_config *config, struct perf_evsel *counter, void *data,
+				       bool first),
+			    void *data)
+{
+	if (counter->merged_stat)
+		return false;
+	cb(config, counter, data, true);
+	if (config->no_merge)
+		uniquify_event_name(counter);
+	else if (counter->auto_merge_stats)
+		collect_all_aliases(config, counter, cb, data);
+	return true;
+}
+
+struct aggr_data {
+	u64 ena, run, val;
+	int id;
+	int nr;
+	int cpu;
+};
+
+static void aggr_cb(struct perf_stat_config *config,
+		    struct perf_evsel *counter, void *data, bool first)
+{
+	struct aggr_data *ad = data;
+	int cpu, s2;
+
+	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+		struct perf_counts_values *counts;
+
+		s2 = config->aggr_get_id(config, perf_evsel__cpus(counter), cpu);
+		if (s2 != ad->id)
+			continue;
+		if (first)
+			ad->nr++;
+		counts = perf_counts(counter->counts, cpu, 0);
+		/*
+		 * When any result is bad, make them all to give
+		 * consistent output in interval mode.
+		 */
+		if (counts->ena == 0 || counts->run == 0 ||
+		    counter->counts->scaled == -1) {
+			ad->ena = 0;
+			ad->run = 0;
+			break;
+		}
+		ad->val += counts->val;
+		ad->ena += counts->ena;
+		ad->run += counts->run;
+	}
+}
+
+static void print_aggr(struct perf_stat_config *config,
+		       struct perf_evlist *evlist,
+		       char *prefix)
+{
+	bool metric_only = config->metric_only;
+	FILE *output = config->output;
+	struct perf_evsel *counter;
+	int s, id, nr;
+	double uval;
+	u64 ena, run, val;
+	bool first;
+
+	if (!(config->aggr_map || config->aggr_get_id))
+		return;
+
+	aggr_update_shadow(config, evlist);
+
+	/*
+	 * With metric_only everything is on a single line.
+	 * Without each counter has its own line.
+	 */
+	for (s = 0; s < config->aggr_map->nr; s++) {
+		struct aggr_data ad;
+		if (prefix && metric_only)
+			fprintf(output, "%s", prefix);
+
+		ad.id = id = config->aggr_map->map[s];
+		first = true;
+		evlist__for_each_entry(evlist, counter) {
+			if (is_duration_time(counter))
+				continue;
+
+			ad.val = ad.ena = ad.run = 0;
+			ad.nr = 0;
+			if (!collect_data(config, counter, aggr_cb, &ad))
+				continue;
+			nr = ad.nr;
+			ena = ad.ena;
+			run = ad.run;
+			val = ad.val;
+			if (first && metric_only) {
+				first = false;
+				aggr_printout(config, counter, id, nr);
+			}
+			if (prefix && !metric_only)
+				fprintf(output, "%s", prefix);
+
+			uval = val * counter->scale;
+			printout(config, id, nr, counter, uval, prefix,
+				 run, ena, 1.0, &rt_stat);
+			if (!metric_only)
+				fputc('\n', output);
+		}
+		if (metric_only)
+			fputc('\n', output);
+	}
+}
+
+static int cmp_val(const void *a, const void *b)
+{
+	return ((struct perf_aggr_thread_value *)b)->val -
+		((struct perf_aggr_thread_value *)a)->val;
+}
+
+static struct perf_aggr_thread_value *sort_aggr_thread(
+					struct perf_evsel *counter,
+					int nthreads, int ncpus,
+					int *ret,
+					struct target *_target)
+{
+	int cpu, thread, i = 0;
+	double uval;
+	struct perf_aggr_thread_value *buf;
+
+	buf = calloc(nthreads, sizeof(struct perf_aggr_thread_value));
+	if (!buf)
+		return NULL;
+
+	for (thread = 0; thread < nthreads; thread++) {
+		u64 ena = 0, run = 0, val = 0;
+
+		for (cpu = 0; cpu < ncpus; cpu++) {
+			val += perf_counts(counter->counts, cpu, thread)->val;
+			ena += perf_counts(counter->counts, cpu, thread)->ena;
+			run += perf_counts(counter->counts, cpu, thread)->run;
+		}
+
+		uval = val * counter->scale;
+
+		/*
+		 * Skip value 0 when enabling --per-thread globally,
+		 * otherwise too many 0 output.
+		 */
+		if (uval == 0.0 && target__has_per_thread(_target))
+			continue;
+
+		buf[i].counter = counter;
+		buf[i].id = thread;
+		buf[i].uval = uval;
+		buf[i].val = val;
+		buf[i].run = run;
+		buf[i].ena = ena;
+		i++;
+	}
+
+	qsort(buf, i, sizeof(struct perf_aggr_thread_value), cmp_val);
+
+	if (ret)
+		*ret = i;
+
+	return buf;
+}
+
+static void print_aggr_thread(struct perf_stat_config *config,
+			      struct target *_target,
+			      struct perf_evsel *counter, char *prefix)
+{
+	FILE *output = config->output;
+	int nthreads = thread_map__nr(counter->threads);
+	int ncpus = cpu_map__nr(counter->cpus);
+	int thread, sorted_threads, id;
+	struct perf_aggr_thread_value *buf;
+
+	buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads, _target);
+	if (!buf) {
+		perror("cannot sort aggr thread");
+		return;
+	}
+
+	for (thread = 0; thread < sorted_threads; thread++) {
+		if (prefix)
+			fprintf(output, "%s", prefix);
+
+		id = buf[thread].id;
+		if (config->stats)
+			printout(config, id, 0, buf[thread].counter, buf[thread].uval,
+				 prefix, buf[thread].run, buf[thread].ena, 1.0,
+				 &config->stats[id]);
+		else
+			printout(config, id, 0, buf[thread].counter, buf[thread].uval,
+				 prefix, buf[thread].run, buf[thread].ena, 1.0,
+				 &rt_stat);
+		fputc('\n', output);
+	}
+
+	free(buf);
+}
+
+struct caggr_data {
+	double avg, avg_enabled, avg_running;
+};
+
+static void counter_aggr_cb(struct perf_stat_config *config __maybe_unused,
+			    struct perf_evsel *counter, void *data,
+			    bool first __maybe_unused)
+{
+	struct caggr_data *cd = data;
+	struct perf_stat_evsel *ps = counter->stats;
+
+	cd->avg += avg_stats(&ps->res_stats[0]);
+	cd->avg_enabled += avg_stats(&ps->res_stats[1]);
+	cd->avg_running += avg_stats(&ps->res_stats[2]);
+}
+
+/*
+ * Print out the results of a single counter:
+ * aggregated counts in system-wide mode
+ */
+static void print_counter_aggr(struct perf_stat_config *config,
+			       struct perf_evsel *counter, char *prefix)
+{
+	bool metric_only = config->metric_only;
+	FILE *output = config->output;
+	double uval;
+	struct caggr_data cd = { .avg = 0.0 };
+
+	if (!collect_data(config, counter, counter_aggr_cb, &cd))
+		return;
+
+	if (prefix && !metric_only)
+		fprintf(output, "%s", prefix);
+
+	uval = cd.avg * counter->scale;
+	printout(config, -1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled,
+		 cd.avg, &rt_stat);
+	if (!metric_only)
+		fprintf(output, "\n");
+}
+
+static void counter_cb(struct perf_stat_config *config __maybe_unused,
+		       struct perf_evsel *counter, void *data,
+		       bool first __maybe_unused)
+{
+	struct aggr_data *ad = data;
+
+	ad->val += perf_counts(counter->counts, ad->cpu, 0)->val;
+	ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena;
+	ad->run += perf_counts(counter->counts, ad->cpu, 0)->run;
+}
+
+/*
+ * Print out the results of a single counter:
+ * does not use aggregated count in system-wide
+ */
+static void print_counter(struct perf_stat_config *config,
+			  struct perf_evsel *counter, char *prefix)
+{
+	FILE *output = config->output;
+	u64 ena, run, val;
+	double uval;
+	int cpu;
+
+	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+		struct aggr_data ad = { .cpu = cpu };
+
+		if (!collect_data(config, counter, counter_cb, &ad))
+			return;
+		val = ad.val;
+		ena = ad.ena;
+		run = ad.run;
+
+		if (prefix)
+			fprintf(output, "%s", prefix);
+
+		uval = val * counter->scale;
+		printout(config, cpu, 0, counter, uval, prefix, run, ena, 1.0,
+			 &rt_stat);
+
+		fputc('\n', output);
+	}
+}
+
+static void print_no_aggr_metric(struct perf_stat_config *config,
+				 struct perf_evlist *evlist,
+				 char *prefix)
+{
+	int cpu;
+	int nrcpus = 0;
+	struct perf_evsel *counter;
+	u64 ena, run, val;
+	double uval;
+
+	nrcpus = evlist->cpus->nr;
+	for (cpu = 0; cpu < nrcpus; cpu++) {
+		bool first = true;
+
+		if (prefix)
+			fputs(prefix, config->output);
+		evlist__for_each_entry(evlist, counter) {
+			if (is_duration_time(counter))
+				continue;
+			if (first) {
+				aggr_printout(config, counter, cpu, 0);
+				first = false;
+			}
+			val = perf_counts(counter->counts, cpu, 0)->val;
+			ena = perf_counts(counter->counts, cpu, 0)->ena;
+			run = perf_counts(counter->counts, cpu, 0)->run;
+
+			uval = val * counter->scale;
+			printout(config, cpu, 0, counter, uval, prefix, run, ena, 1.0,
+				 &rt_stat);
+		}
+		fputc('\n', config->output);
+	}
+}
+
+static int aggr_header_lens[] = {
+	[AGGR_CORE] = 18,
+	[AGGR_SOCKET] = 12,
+	[AGGR_NONE] = 6,
+	[AGGR_THREAD] = 24,
+	[AGGR_GLOBAL] = 0,
+};
+
+static const char *aggr_header_csv[] = {
+	[AGGR_CORE] 	= 	"core,cpus,",
+	[AGGR_SOCKET] 	= 	"socket,cpus",
+	[AGGR_NONE] 	= 	"cpu,",
+	[AGGR_THREAD] 	= 	"comm-pid,",
+	[AGGR_GLOBAL] 	=	""
+};
+
+static void print_metric_headers(struct perf_stat_config *config,
+				 struct perf_evlist *evlist,
+				 const char *prefix, bool no_indent)
+{
+	struct perf_stat_output_ctx out;
+	struct perf_evsel *counter;
+	struct outstate os = {
+		.fh = config->output
+	};
+
+	if (prefix)
+		fprintf(config->output, "%s", prefix);
+
+	if (!config->csv_output && !no_indent)
+		fprintf(config->output, "%*s",
+			aggr_header_lens[config->aggr_mode], "");
+	if (config->csv_output) {
+		if (config->interval)
+			fputs("time,", config->output);
+		fputs(aggr_header_csv[config->aggr_mode], config->output);
+	}
+
+	/* Print metrics headers only */
+	evlist__for_each_entry(evlist, counter) {
+		if (is_duration_time(counter))
+			continue;
+		os.evsel = counter;
+		out.ctx = &os;
+		out.print_metric = print_metric_header;
+		out.new_line = new_line_metric;
+		out.force_header = true;
+		os.evsel = counter;
+		perf_stat__print_shadow_stats(config, counter, 0,
+					      0,
+					      &out,
+					      &config->metric_events,
+					      &rt_stat);
+	}
+	fputc('\n', config->output);
+}
+
+static void print_interval(struct perf_stat_config *config,
+			   struct perf_evlist *evlist,
+			   char *prefix, struct timespec *ts)
+{
+	bool metric_only = config->metric_only;
+	unsigned int unit_width = config->unit_width;
+	FILE *output = config->output;
+	static int num_print_interval;
+
+	if (config->interval_clear)
+		puts(CONSOLE_CLEAR);
+
+	sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, config->csv_sep);
+
+	if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) {
+		switch (config->aggr_mode) {
+		case AGGR_SOCKET:
+			fprintf(output, "#           time socket cpus");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
+			break;
+		case AGGR_CORE:
+			fprintf(output, "#           time core         cpus");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
+			break;
+		case AGGR_NONE:
+			fprintf(output, "#           time CPU    ");
+			if (!metric_only)
+				fprintf(output, "                counts %*s events\n", unit_width, "unit");
+			break;
+		case AGGR_THREAD:
+			fprintf(output, "#           time             comm-pid");
+			if (!metric_only)
+				fprintf(output, "                  counts %*s events\n", unit_width, "unit");
+			break;
+		case AGGR_GLOBAL:
+		default:
+			fprintf(output, "#           time");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
+		case AGGR_UNSET:
+			break;
+		}
+	}
+
+	if ((num_print_interval == 0 || config->interval_clear) && metric_only)
+		print_metric_headers(config, evlist, " ", true);
+	if (++num_print_interval == 25)
+		num_print_interval = 0;
+}
+
+static void print_header(struct perf_stat_config *config,
+			 struct target *_target,
+			 int argc, const char **argv)
+{
+	FILE *output = config->output;
+	int i;
+
+	fflush(stdout);
+
+	if (!config->csv_output) {
+		fprintf(output, "\n");
+		fprintf(output, " Performance counter stats for ");
+		if (_target->system_wide)
+			fprintf(output, "\'system wide");
+		else if (_target->cpu_list)
+			fprintf(output, "\'CPU(s) %s", _target->cpu_list);
+		else if (!target__has_task(_target)) {
+			fprintf(output, "\'%s", argv ? argv[0] : "pipe");
+			for (i = 1; argv && (i < argc); i++)
+				fprintf(output, " %s", argv[i]);
+		} else if (_target->pid)
+			fprintf(output, "process id \'%s", _target->pid);
+		else
+			fprintf(output, "thread id \'%s", _target->tid);
+
+		fprintf(output, "\'");
+		if (config->run_count > 1)
+			fprintf(output, " (%d runs)", config->run_count);
+		fprintf(output, ":\n\n");
+	}
+}
+
+static int get_precision(double num)
+{
+	if (num > 1)
+		return 0;
+
+	return lround(ceil(-log10(num)));
+}
+
+static void print_table(struct perf_stat_config *config,
+			FILE *output, int precision, double avg)
+{
+	char tmp[64];
+	int idx, indent = 0;
+
+	scnprintf(tmp, 64, " %17.*f", precision, avg);
+	while (tmp[indent] == ' ')
+		indent++;
+
+	fprintf(output, "%*s# Table of individual measurements:\n", indent, "");
+
+	for (idx = 0; idx < config->run_count; idx++) {
+		double run = (double) config->walltime_run[idx] / NSEC_PER_SEC;
+		int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5);
+
+		fprintf(output, " %17.*f (%+.*f) ",
+			precision, run, precision, run - avg);
+
+		for (h = 0; h < n; h++)
+			fprintf(output, "#");
+
+		fprintf(output, "\n");
+	}
+
+	fprintf(output, "\n%*s# Final result:\n", indent, "");
+}
+
+static double timeval2double(struct timeval *t)
+{
+	return t->tv_sec + (double) t->tv_usec/USEC_PER_SEC;
+}
+
+static void print_footer(struct perf_stat_config *config)
+{
+	double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
+	FILE *output = config->output;
+	int n;
+
+	if (!config->null_run)
+		fprintf(output, "\n");
+
+	if (config->run_count == 1) {
+		fprintf(output, " %17.9f seconds time elapsed", avg);
+
+		if (config->ru_display) {
+			double ru_utime = timeval2double(&config->ru_data.ru_utime);
+			double ru_stime = timeval2double(&config->ru_data.ru_stime);
+
+			fprintf(output, "\n\n");
+			fprintf(output, " %17.9f seconds user\n", ru_utime);
+			fprintf(output, " %17.9f seconds sys\n", ru_stime);
+		}
+	} else {
+		double sd = stddev_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
+		/*
+		 * Display at most 2 more significant
+		 * digits than the stddev inaccuracy.
+		 */
+		int precision = get_precision(sd) + 2;
+
+		if (config->walltime_run_table)
+			print_table(config, output, precision, avg);
+
+		fprintf(output, " %17.*f +- %.*f seconds time elapsed",
+			precision, avg, precision, sd);
+
+		print_noise_pct(config, sd, avg);
+	}
+	fprintf(output, "\n\n");
+
+	if (config->print_free_counters_hint &&
+	    sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 &&
+	    n > 0)
+		fprintf(output,
+"Some events weren't counted. Try disabling the NMI watchdog:\n"
+"	echo 0 > /proc/sys/kernel/nmi_watchdog\n"
+"	perf stat ...\n"
+"	echo 1 > /proc/sys/kernel/nmi_watchdog\n");
+
+	if (config->print_mixed_hw_group_error)
+		fprintf(output,
+			"The events in group usually have to be from "
+			"the same PMU. Try reorganizing the group.\n");
+}
+
+void
+perf_evlist__print_counters(struct perf_evlist *evlist,
+			    struct perf_stat_config *config,
+			    struct target *_target,
+			    struct timespec *ts,
+			    int argc, const char **argv)
+{
+	bool metric_only = config->metric_only;
+	int interval = config->interval;
+	struct perf_evsel *counter;
+	char buf[64], *prefix = NULL;
+
+	if (interval)
+		print_interval(config, evlist, prefix = buf, ts);
+	else
+		print_header(config, _target, argc, argv);
+
+	if (metric_only) {
+		static int num_print_iv;
+
+		if (num_print_iv == 0 && !interval)
+			print_metric_headers(config, evlist, prefix, false);
+		if (num_print_iv++ == 25)
+			num_print_iv = 0;
+		if (config->aggr_mode == AGGR_GLOBAL && prefix)
+			fprintf(config->output, "%s", prefix);
+	}
+
+	switch (config->aggr_mode) {
+	case AGGR_CORE:
+	case AGGR_SOCKET:
+		print_aggr(config, evlist, prefix);
+		break;
+	case AGGR_THREAD:
+		evlist__for_each_entry(evlist, counter) {
+			if (is_duration_time(counter))
+				continue;
+			print_aggr_thread(config, _target, counter, prefix);
+		}
+		break;
+	case AGGR_GLOBAL:
+		evlist__for_each_entry(evlist, counter) {
+			if (is_duration_time(counter))
+				continue;
+			print_counter_aggr(config, counter, prefix);
+		}
+		if (metric_only)
+			fputc('\n', config->output);
+		break;
+	case AGGR_NONE:
+		if (metric_only)
+			print_no_aggr_metric(config, evlist, prefix);
+		else {
+			evlist__for_each_entry(evlist, counter) {
+				if (is_duration_time(counter))
+					continue;
+				print_counter(config, counter, prefix);
+			}
+		}
+		break;
+	case AGGR_UNSET:
+	default:
+		break;
+	}
+
+	if (!interval && !config->csv_output)
+		print_footer(config);
+
+	fflush(config->output);
+}
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 99990f5f2512a..8ad32763cffff 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -410,7 +410,8 @@ static double runtime_stat_n(struct runtime_stat *st,
 	return v->stats.n;
 }
 
-static void print_stalled_cycles_frontend(int cpu,
+static void print_stalled_cycles_frontend(struct perf_stat_config *config,
+					  int cpu,
 					  struct perf_evsel *evsel, double avg,
 					  struct perf_stat_output_ctx *out,
 					  struct runtime_stat *st)
@@ -427,13 +428,14 @@ static void print_stalled_cycles_frontend(int cpu,
 	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
 
 	if (ratio)
-		out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
+		out->print_metric(config, out->ctx, color, "%7.2f%%", "frontend cycles idle",
 				  ratio);
 	else
-		out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
+		out->print_metric(config, out->ctx, NULL, NULL, "frontend cycles idle", 0);
 }
 
-static void print_stalled_cycles_backend(int cpu,
+static void print_stalled_cycles_backend(struct perf_stat_config *config,
+					 int cpu,
 					 struct perf_evsel *evsel, double avg,
 					 struct perf_stat_output_ctx *out,
 					 struct runtime_stat *st)
@@ -449,10 +451,11 @@ static void print_stalled_cycles_backend(int cpu,
 
 	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 
-	out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
 }
 
-static void print_branch_misses(int cpu,
+static void print_branch_misses(struct perf_stat_config *config,
+				int cpu,
 				struct perf_evsel *evsel,
 				double avg,
 				struct perf_stat_output_ctx *out,
@@ -469,10 +472,11 @@ static void print_branch_misses(int cpu,
 
 	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
-	out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all branches", ratio);
 }
 
-static void print_l1_dcache_misses(int cpu,
+static void print_l1_dcache_misses(struct perf_stat_config *config,
+				   int cpu,
 				   struct perf_evsel *evsel,
 				   double avg,
 				   struct perf_stat_output_ctx *out,
@@ -490,10 +494,11 @@ static void print_l1_dcache_misses(int cpu,
 
 	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
-	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
 }
 
-static void print_l1_icache_misses(int cpu,
+static void print_l1_icache_misses(struct perf_stat_config *config,
+				   int cpu,
 				   struct perf_evsel *evsel,
 				   double avg,
 				   struct perf_stat_output_ctx *out,
@@ -510,10 +515,11 @@ static void print_l1_icache_misses(int cpu,
 		ratio = avg / total * 100.0;
 
 	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
 }
 
-static void print_dtlb_cache_misses(int cpu,
+static void print_dtlb_cache_misses(struct perf_stat_config *config,
+				    int cpu,
 				    struct perf_evsel *evsel,
 				    double avg,
 				    struct perf_stat_output_ctx *out,
@@ -529,10 +535,11 @@ static void print_dtlb_cache_misses(int cpu,
 		ratio = avg / total * 100.0;
 
 	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-	out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
 }
 
-static void print_itlb_cache_misses(int cpu,
+static void print_itlb_cache_misses(struct perf_stat_config *config,
+				    int cpu,
 				    struct perf_evsel *evsel,
 				    double avg,
 				    struct perf_stat_output_ctx *out,
@@ -548,10 +555,11 @@ static void print_itlb_cache_misses(int cpu,
 		ratio = avg / total * 100.0;
 
 	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-	out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
 }
 
-static void print_ll_cache_misses(int cpu,
+static void print_ll_cache_misses(struct perf_stat_config *config,
+				  int cpu,
 				  struct perf_evsel *evsel,
 				  double avg,
 				  struct perf_stat_output_ctx *out,
@@ -567,7 +575,7 @@ static void print_ll_cache_misses(int cpu,
 		ratio = avg / total * 100.0;
 
 	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-	out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
+	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
 }
 
 /*
@@ -674,7 +682,8 @@ static double td_be_bound(int ctx, int cpu, struct runtime_stat *st)
 	return sanitize_val(1.0 - sum);
 }
 
-static void print_smi_cost(int cpu, struct perf_evsel *evsel,
+static void print_smi_cost(struct perf_stat_config *config,
+			   int cpu, struct perf_evsel *evsel,
 			   struct perf_stat_output_ctx *out,
 			   struct runtime_stat *st)
 {
@@ -694,11 +703,12 @@ static void print_smi_cost(int cpu, struct perf_evsel *evsel,
 
 	if (cost > 10)
 		color = PERF_COLOR_RED;
-	out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
-	out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
+	out->print_metric(config, out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
+	out->print_metric(config, out->ctx, NULL, "%4.0f", "SMI#", smi_num);
 }
 
-static void generic_metric(const char *metric_expr,
+static void generic_metric(struct perf_stat_config *config,
+			   const char *metric_expr,
 			   struct perf_evsel **metric_events,
 			   char *name,
 			   const char *metric_name,
@@ -737,20 +747,21 @@ static void generic_metric(const char *metric_expr,
 		const char *p = metric_expr;
 
 		if (expr__parse(&ratio, &pctx, &p) == 0)
-			print_metric(ctxp, NULL, "%8.1f",
+			print_metric(config, ctxp, NULL, "%8.1f",
 				metric_name ?
 				metric_name :
 				out->force_header ?  name : "",
 				ratio);
 		else
-			print_metric(ctxp, NULL, NULL,
+			print_metric(config, ctxp, NULL, NULL,
 				     out->force_header ?
 				     (metric_name ? metric_name : name) : "", 0);
 	} else
-		print_metric(ctxp, NULL, NULL, "", 0);
+		print_metric(config, ctxp, NULL, NULL, "", 0);
 }
 
-void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+void perf_stat__print_shadow_stats(struct perf_stat_config *config,
+				   struct perf_evsel *evsel,
 				   double avg, int cpu,
 				   struct perf_stat_output_ctx *out,
 				   struct rblist *metric_events,
@@ -769,10 +780,10 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 
 		if (total) {
 			ratio = avg / total;
-			print_metric(ctxp, NULL, "%7.2f ",
+			print_metric(config, ctxp, NULL, "%7.2f ",
 					"insn per cycle", ratio);
 		} else {
-			print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
+			print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0);
 		}
 
 		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT,
@@ -783,20 +794,20 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 						    ctx, cpu));
 
 		if (total && avg) {
-			out->new_line(ctxp);
+			out->new_line(config, ctxp);
 			ratio = total / avg;
-			print_metric(ctxp, NULL, "%7.2f ",
+			print_metric(config, ctxp, NULL, "%7.2f ",
 					"stalled cycles per insn",
 					ratio);
 		} else if (have_frontend_stalled) {
-			print_metric(ctxp, NULL, NULL,
+			print_metric(config, ctxp, NULL, NULL,
 				     "stalled cycles per insn", 0);
 		}
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
 		if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0)
-			print_branch_misses(cpu, evsel, avg, out, st);
+			print_branch_misses(config, cpu, evsel, avg, out, st);
 		else
-			print_metric(ctxp, NULL, NULL, "of all branches", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all branches", 0);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
@@ -804,9 +815,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
 		if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0)
-			print_l1_dcache_misses(cpu, evsel, avg, out, st);
+			print_l1_dcache_misses(config, cpu, evsel, avg, out, st);
 		else
-			print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all L1-dcache hits", 0);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
@@ -814,9 +825,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
 		if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0)
-			print_l1_icache_misses(cpu, evsel, avg, out, st);
+			print_l1_icache_misses(config, cpu, evsel, avg, out, st);
 		else
-			print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all L1-icache hits", 0);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
@@ -824,9 +835,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
 		if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0)
-			print_dtlb_cache_misses(cpu, evsel, avg, out, st);
+			print_dtlb_cache_misses(config, cpu, evsel, avg, out, st);
 		else
-			print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all dTLB cache hits", 0);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
@@ -834,9 +845,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
 		if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0)
-			print_itlb_cache_misses(cpu, evsel, avg, out, st);
+			print_itlb_cache_misses(config, cpu, evsel, avg, out, st);
 		else
-			print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all iTLB cache hits", 0);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
@@ -844,9 +855,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
 		if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0)
-			print_ll_cache_misses(cpu, evsel, avg, out, st);
+			print_ll_cache_misses(config, cpu, evsel, avg, out, st);
 		else
-			print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all LL-cache hits", 0);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
 		total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu);
 
@@ -854,32 +865,32 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 			ratio = avg * 100 / total;
 
 		if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0)
-			print_metric(ctxp, NULL, "%8.3f %%",
+			print_metric(config, ctxp, NULL, "%8.3f %%",
 				     "of all cache refs", ratio);
 		else
-			print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
+			print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-		print_stalled_cycles_frontend(cpu, evsel, avg, out, st);
+		print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-		print_stalled_cycles_backend(cpu, evsel, avg, out, st);
+		print_stalled_cycles_backend(config, cpu, evsel, avg, out, st);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
 		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);
 
 		if (total) {
 			ratio = avg / total;
-			print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
+			print_metric(config, ctxp, NULL, "%8.3f", "GHz", ratio);
 		} else {
-			print_metric(ctxp, NULL, NULL, "Ghz", 0);
+			print_metric(config, ctxp, NULL, NULL, "Ghz", 0);
 		}
 	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
 		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
 
 		if (total)
-			print_metric(ctxp, NULL,
+			print_metric(config, ctxp, NULL,
 					"%7.2f%%", "transactional cycles",
 					100.0 * (avg / total));
 		else
-			print_metric(ctxp, NULL, NULL, "transactional cycles",
+			print_metric(config, ctxp, NULL, NULL, "transactional cycles",
 				     0);
 	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
 		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
@@ -888,10 +899,10 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		if (total2 < avg)
 			total2 = avg;
 		if (total)
-			print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
+			print_metric(config, ctxp, NULL, "%7.2f%%", "aborted cycles",
 				100.0 * ((total2-avg) / total));
 		else
-			print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
+			print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0);
 	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
 		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
 					 ctx, cpu);
@@ -900,10 +911,10 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 			ratio = total / avg;
 
 		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0)
-			print_metric(ctxp, NULL, "%8.0f",
+			print_metric(config, ctxp, NULL, "%8.0f",
 				     "cycles / transaction", ratio);
 		else
-			print_metric(ctxp, NULL, NULL, "cycles / transaction",
+			print_metric(config, ctxp, NULL, NULL, "cycles / transaction",
 				      0);
 	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
 		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
@@ -912,33 +923,33 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		if (avg)
 			ratio = total / avg;
 
-		print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
+		print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio);
 	} else if (perf_evsel__is_clock(evsel)) {
 		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
-			print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
+			print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized",
 				     avg / (ratio * evsel->scale));
 		else
-			print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
+			print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
 		double fe_bound = td_fe_bound(ctx, cpu, st);
 
 		if (fe_bound > 0.2)
 			color = PERF_COLOR_RED;
-		print_metric(ctxp, color, "%8.1f%%", "frontend bound",
+		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
 				fe_bound * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
 		double retiring = td_retiring(ctx, cpu, st);
 
 		if (retiring > 0.7)
 			color = PERF_COLOR_GREEN;
-		print_metric(ctxp, color, "%8.1f%%", "retiring",
+		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
 				retiring * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
 		double bad_spec = td_bad_spec(ctx, cpu, st);
 
 		if (bad_spec > 0.1)
 			color = PERF_COLOR_RED;
-		print_metric(ctxp, color, "%8.1f%%", "bad speculation",
+		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
 				bad_spec * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
 		double be_bound = td_be_bound(ctx, cpu, st);
@@ -955,12 +966,12 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		if (be_bound > 0.2)
 			color = PERF_COLOR_RED;
 		if (td_total_slots(ctx, cpu, st) > 0)
-			print_metric(ctxp, color, "%8.1f%%", name,
+			print_metric(config, ctxp, color, "%8.1f%%", name,
 					be_bound * 100.);
 		else
-			print_metric(ctxp, NULL, NULL, name, 0);
+			print_metric(config, ctxp, NULL, NULL, name, 0);
 	} else if (evsel->metric_expr) {
-		generic_metric(evsel->metric_expr, evsel->metric_events, evsel->name,
+		generic_metric(config, evsel->metric_expr, evsel->metric_events, evsel->name,
 				evsel->metric_name, avg, cpu, out, st);
 	} else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) {
 		char unit = 'M';
@@ -975,9 +986,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 			unit = 'K';
 		}
 		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
-		print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
+		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
 	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
-		print_smi_cost(cpu, evsel, out, st);
+		print_smi_cost(config, cpu, evsel, out, st);
 	} else {
 		num = 0;
 	}
@@ -987,12 +998,12 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 
 		list_for_each_entry (mexp, &me->head, nd) {
 			if (num++ > 0)
-				out->new_line(ctxp);
-			generic_metric(mexp->metric_expr, mexp->metric_events,
+				out->new_line(config, ctxp);
+			generic_metric(config, mexp->metric_expr, mexp->metric_events,
 					evsel->name, mexp->metric_name,
 					avg, cpu, out, st);
 		}
 	}
 	if (num == 0)
-		print_metric(ctxp, NULL, NULL, NULL, 0);
+		print_metric(config, ctxp, NULL, NULL, NULL, 0);
 }
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index a0061e0b0fade..5d3172bcc4ae7 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -435,3 +435,98 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp)
 
 	return ret;
 }
+
+int create_perf_stat_counter(struct perf_evsel *evsel,
+			     struct perf_stat_config *config,
+			     struct target *target)
+{
+	struct perf_event_attr *attr = &evsel->attr;
+	struct perf_evsel *leader = evsel->leader;
+
+	if (config->scale) {
+		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+				    PERF_FORMAT_TOTAL_TIME_RUNNING;
+	}
+
+	/*
+	 * The event is part of non trivial group, let's enable
+	 * the group read (for leader) and ID retrieval for all
+	 * members.
+	 */
+	if (leader->nr_members > 1)
+		attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP;
+
+	attr->inherit = !config->no_inherit;
+
+	/*
+	 * Some events get initialized with sample_(period/type) set,
+	 * like tracepoints. Clear it up for counting.
+	 */
+	attr->sample_period = 0;
+
+	if (config->identifier)
+		attr->sample_type = PERF_SAMPLE_IDENTIFIER;
+
+	/*
+	 * Disabling all counters initially, they will be enabled
+	 * either manually by us or by kernel via enable_on_exec
+	 * set later.
+	 */
+	if (perf_evsel__is_group_leader(evsel)) {
+		attr->disabled = 1;
+
+		/*
+		 * In case of initial_delay we enable tracee
+		 * events manually.
+		 */
+		if (target__none(target) && !config->initial_delay)
+			attr->enable_on_exec = 1;
+	}
+
+	if (target__has_cpu(target) && !target__has_per_thread(target))
+		return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));
+
+	return perf_evsel__open_per_thread(evsel, evsel->threads);
+}
+
+int perf_stat_synthesize_config(struct perf_stat_config *config,
+				struct perf_tool *tool,
+				struct perf_evlist *evlist,
+				perf_event__handler_t process,
+				bool attrs)
+{
+	int err;
+
+	if (attrs) {
+		err = perf_event__synthesize_attrs(tool, evlist, process);
+		if (err < 0) {
+			pr_err("Couldn't synthesize attrs.\n");
+			return err;
+		}
+	}
+
+	err = perf_event__synthesize_extra_attr(tool, evlist, process,
+						attrs);
+
+	err = perf_event__synthesize_thread_map2(tool, evlist->threads,
+						 process, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize thread map.\n");
+		return err;
+	}
+
+	err = perf_event__synthesize_cpu_map(tool, evlist->cpus,
+					     process, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize thread map.\n");
+		return err;
+	}
+
+	err = perf_event__synthesize_stat_config(tool, config, process, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize config.\n");
+		return err;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 36efb986f7fc6..3a13a6dc5a629 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -4,8 +4,14 @@
 
 #include <linux/types.h>
 #include <stdio.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
 #include "xyarray.h"
 #include "rblist.h"
+#include "perf.h"
+#include "event.h"
 
 struct stats {
 	double n, mean, M2;
@@ -84,15 +90,42 @@ struct runtime_stat {
 	struct rblist value_list;
 };
 
+typedef int (*aggr_get_id_t)(struct perf_stat_config *config,
+			     struct cpu_map *m, int cpu);
+
 struct perf_stat_config {
-	enum aggr_mode	aggr_mode;
-	bool		scale;
-	FILE		*output;
-	unsigned int	interval;
-	unsigned int	timeout;
-	int		times;
-	struct runtime_stat *stats;
-	int		stats_num;
+	enum aggr_mode		 aggr_mode;
+	bool			 scale;
+	bool			 no_inherit;
+	bool			 identifier;
+	bool			 csv_output;
+	bool			 interval_clear;
+	bool			 metric_only;
+	bool			 null_run;
+	bool			 ru_display;
+	bool			 big_num;
+	bool			 no_merge;
+	bool			 walltime_run_table;
+	FILE			*output;
+	unsigned int		 interval;
+	unsigned int		 timeout;
+	unsigned int		 initial_delay;
+	unsigned int		 unit_width;
+	unsigned int		 metric_only_len;
+	int			 times;
+	int			 run_count;
+	int			 print_free_counters_hint;
+	int			 print_mixed_hw_group_error;
+	struct runtime_stat	*stats;
+	int			 stats_num;
+	const char		*csv_sep;
+	struct stats		*walltime_nsecs_stats;
+	struct rusage		 ru_data;
+	struct cpu_map		*aggr_map;
+	aggr_get_id_t		 aggr_get_id;
+	struct cpu_map		*cpus_aggr_map;
+	u64			*walltime_run;
+	struct rblist		 metric_events;
 };
 
 void update_stats(struct stats *stats, u64 val);
@@ -130,9 +163,10 @@ bool __perf_evsel_stat__is(struct perf_evsel *evsel,
 extern struct runtime_stat rt_stat;
 extern struct stats walltime_nsecs_stats;
 
-typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit,
+typedef void (*print_metric_t)(struct perf_stat_config *config,
+			       void *ctx, const char *color, const char *unit,
 			       const char *fmt, double val);
-typedef void (*new_line_t )(void *ctx);
+typedef void (*new_line_t)(struct perf_stat_config *config, void *ctx);
 
 void runtime_stat__init(struct runtime_stat *st);
 void runtime_stat__exit(struct runtime_stat *st);
@@ -148,7 +182,8 @@ struct perf_stat_output_ctx {
 	bool force_header;
 };
 
-void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+void perf_stat__print_shadow_stats(struct perf_stat_config *config,
+				   struct perf_evsel *evsel,
 				   double avg, int cpu,
 				   struct perf_stat_output_ctx *out,
 				   struct rblist *metric_events,
@@ -171,4 +206,19 @@ int perf_event__process_stat_event(struct perf_tool *tool,
 size_t perf_event__fprintf_stat(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_stat_round(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp);
+
+int create_perf_stat_counter(struct perf_evsel *evsel,
+			     struct perf_stat_config *config,
+			     struct target *target);
+int perf_stat_synthesize_config(struct perf_stat_config *config,
+				struct perf_tool *tool,
+				struct perf_evlist *evlist,
+				perf_event__handler_t process,
+				bool attrs);
+void
+perf_evlist__print_counters(struct perf_evlist *evlist,
+			    struct perf_stat_config *config,
+			    struct target *_target,
+			    struct timespec *ts,
+			    int argc, const char **argv);
 #endif
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index c85d0d1a65ed7..7b0ca7cbb7de8 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -377,7 +377,7 @@ static int record_ftrace_printk(void)
 
 static int record_saved_cmdline(void)
 {
-	unsigned int size;
+	unsigned long long size;
 	char *path;
 	struct stat st;
 	int ret, err = 0;
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 920b1d58a0689..e76214f8d596b 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -164,16 +164,15 @@ void parse_ftrace_printk(struct tep_handle *pevent,
 void parse_saved_cmdline(struct tep_handle *pevent,
 			 char *file, unsigned int size __maybe_unused)
 {
-	char *comm;
+	char comm[17]; /* Max comm length in the kernel is 16. */
 	char *line;
 	char *next = NULL;
 	int pid;
 
 	line = strtok_r(file, "\n", &next);
 	while (line) {
-		sscanf(line, "%d %ms", &pid, &comm);
-		tep_register_comm(pevent, comm, pid);
-		free(comm);
+		if (sscanf(line, "%d %16s", &pid, comm) == 2)
+			tep_register_comm(pevent, comm, pid);
 		line = strtok_r(NULL, "\n", &next);
 	}
 }
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 40204ec3a7a28..c69d77d7cf55d 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -3,6 +3,7 @@
 #define _PERF_UTIL_TRACE_EVENT_H
 
 #include <traceevent/event-parse.h>
+#include <traceevent/trace-seq.h>
 #include "parse-events.h"
 
 struct machine;