From e8a99af68c068865dbac7f3330e97bf8e96edf33 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:44:17 +0800 Subject: [PATCH 01/25] tools/power turbostat: Add initial support for PantherLake Add initial support for PantherLake. It shares the same features with Lunarlake. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 58a487c225a73..540336138ce9f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1024,6 +1024,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_ARROWLAKE_U, &adl_features }, { INTEL_ARROWLAKE, &adl_features }, { INTEL_LUNARLAKE_M, &lnl_features }, + { INTEL_PANTHERLAKE_L, &lnl_features }, { INTEL_ATOM_SILVERMONT, &slv_features }, { INTEL_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_ATOM_AIRMONT, &amt_features }, From 6b47ed23e2f1bc2c177da47437970e6208ac9ea0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:44:18 +0800 Subject: [PATCH 02/25] tools/power turbostat: Add initial support for ClearwaterForest Add initial support for ClearwaterForest. It shares the same features with SierraForest. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 540336138ce9f..e203f109dd2e2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1037,6 +1037,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_ATOM_GRACEMONT, &adl_features }, { INTEL_ATOM_CRESTMONT_X, &srf_features }, { INTEL_ATOM_CRESTMONT, &grr_features }, + { INTEL_ATOM_DARKMONT_X, &srf_features }, { INTEL_XEON_PHI_KNL, &knl_features }, { INTEL_XEON_PHI_KNM, &knl_features }, /* From 9e47f8adb053b69e2e8310551e6fd5156704cef4 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 3 Dec 2024 12:23:22 -0500 Subject: [PATCH 03/25] tools/power turbostat: update turbostat(8) Clarify how to get the latest version. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 28 ++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index a7f7ed01421c1..59b89e6b25bf0 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -516,14 +516,40 @@ that they count at TSC rate, which is true on all processors tested to date. Volume 3B: System Programming Guide" https://www.intel.com/products/processor/manuals/ +.SH RUN THE LATEST VERSION +If turbostat complains that it doesn't recognize your processor, +please try the latest version. + +The latest version of turbostat does not require the latest version of the Linux kernel. +However, some features, such as perf(1) counters, do require kernel support. + +The latest turbostat release is available in the upstream Linux Kernel source tree. +eg. "git pull https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" +and run make in tools/power/x86/turbostat/. + +n.b. "make install" will update your system manually, but a distro update may subsequently downgrade your turbostat to an older version. +For this reason, manually installing to /usr/local/bin may be what you want. + +Note that turbostat/Makefile has a "make snapshot" target, which will create a tar file +that can build without a local kernel source tree. + +If the upstream version isn't new enough, the development tree can be found here: +"git pull https://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat" + +If the development tree doesn't work, please contact the author via chat, +or via email with the word "turbostat" on the Subject line. + .SH FILES .ta .nf +/sys/bus/event_source/devices/ /dev/cpu/*/msr +/sys/class/intel_pmt/ +/sys/devices/system/cpu/ .fi .SH "SEE ALSO" -msr(4), vmstat(8) +perf(1), msr(4), vmstat(8) .PP .SH AUTHOR .nf From 4133be39e216130a86382fb5cfbaf6851a6f7a45 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:16 +0800 Subject: [PATCH 04/25] tools/power turbostat: Exit on unsupported Intel models Turbostat requires per-platform enabling for Intel CPU models due to platform-specific features. When running on unsupported Intel CPU models, turbostat currently operates with limited default features, which can lead to users unknowingly using an outdated version of the tool. Enhance turbostat to exit by default when run on unsupported Intel CPU models, with a clear message to users, informing them that their CPU model is not supported and advising them to update to the latest version of turbostat for full functionality. [lenb: updated error message wording] Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e203f109dd2e2..5e894b71003c9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1079,6 +1079,10 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } } + + fprintf(stderr, "Unsupported platform detected.\n" + "\tSee RUN THE LATEST VERSION on turbostat(8)\n"); + exit(1); } /* Model specific support End */ From 48c62ba1b407140229e92f5cfae6ae113fc4af8e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:17 +0800 Subject: [PATCH 05/25] tools/power turbostat: Exit on unsupported Vendors Turbostat currently supports x86 processors from Intel, AMD, and Hygon. The behavior of turbostat on CPUs from other vendors has not been evaluated and may lead to incorrect or undefined behavior. Enhance turbostat to exit by default when running on an unsupported CPU vendor. This ensures that users are aware that their CPU is not currently supported by turbostat, guiding them to seek support for their specific hardware through future patches. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5e894b71003c9..cb659b274554b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1056,9 +1056,9 @@ void probe_platform_features(unsigned int family, unsigned int model) { int i; - platform = &default_features; if (authentic_amd || hygon_genuine) { + platform = &default_features; if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -1071,7 +1071,7 @@ void probe_platform_features(unsigned int family, unsigned int model) } if (!genuine_intel) - return; + goto end; for (i = 0; turbostat_pdata[i].features; i++) { if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) { @@ -1080,6 +1080,10 @@ void probe_platform_features(unsigned int family, unsigned int model) } } +end: + if (platform) + return; + fprintf(stderr, "Unsupported platform detected.\n" "\tSee RUN THE LATEST VERSION on turbostat(8)\n"); exit(1); From cc63f89ef9db70f74c563317d36028bb5e6196a1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:18 +0800 Subject: [PATCH 06/25] tools/power turbostat: Improve --help output Improve the `--help` output of turbostat by standardizing the format and enhancing readability. The following changes are made to ensure consistency and clarity in the help message: 1. Use a consistent pattern for each parameter's help message: - Display the parameter and its input (if any) on the same line, separated by a space. - Provide the detailed description on a separate line. 2. Ensure that the first character of each description is in lower-case. These changes make the help output more uniform and easier to read, helping users quickly understand the available options and their usage. No functional change. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 41 +++++++++++++++++---------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index cb659b274554b..5165450a81877 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2145,41 +2145,52 @@ void help(void) "when COMMAND completes.\n" "If no COMMAND is specified, turbostat wakes every 5-seconds\n" "to print statistics, until interrupted.\n" - " -a, --add add a counter\n" + " -a, --add counter\n" + " add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" " eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n" - " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" + " -c, --cpu cpu-set\n" + " limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" - " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " -d, --debug\n" + " displays usec, Time_Of_Day_Seconds and more debugging\n" " debug messages are printed to stderr\n" - " -D, --Dump displays the raw counter values\n" - " -e, --enable [all | column]\n" + " -D, --Dump\n" + " displays the raw counter values\n" + " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" - " -H, --hide [column|column,column,...]\n" + " -H, --hide [column | column,column,...]\n" " hide the specified column(s)\n" " -i, --interval sec.subsec\n" - " Override default 5-second measurement interval\n" - " -J, --Joules displays energy in Joules instead of Watts\n" - " -l, --list list column headers only\n" - " -M, --no-msr Disable all uses of the MSR driver\n" - " -P, --no-perf Disable all uses of the perf API\n" + " override default 5-second measurement interval\n" + " -J, --Joules\n" + " displays energy in Joules instead of Watts\n" + " -l, --list\n" + " list column headers only\n" + " -M, --no-msr\n" + " disable all uses of the MSR driver\n" + " -P, --no-perf\n" + " disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" " print header every num iterations\n" " -o, --out file\n" " create or truncate \"file\" for all output\n" - " -q, --quiet skip decoding system configuration header\n" - " -s, --show [column|column,column,...]\n" + " -q, --quiet\n" + " skip decoding system configuration header\n" + " -s, --show [column | column,column,...]\n" " show only the specified column(s)\n" " -S, --Summary\n" " limits output to 1-line system summary per interval\n" " -T, --TCC temperature\n" " sets the Thermal Control Circuit temperature in\n" " degrees Celsius\n" - " -h, --help print this help message\n" - " -v, --version print version information\n" "\n" "For more help, run \"man turbostat\"\n"); + " -h, --help\n" + " print this help message\n" + " -v, --version\n" + " print version information\n" "\n" "For more help, run \"man turbostat\"\n"); } /* From 3d94026af328d3d355d15c1d7fe73278f77c6a42 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:19 +0800 Subject: [PATCH 07/25] tools/power turbostat: Introduce --force parameter Turbostat currently exits under the following conditions: 1. When running on non-Intel/AMD/Hygon x86 vendors. 2. When running on Intel models that lack specific platform features. Introduce a new `--force` parameter that allows turbostat to run on these unsupported platforms with minimal default feature support. This provides users with the flexibility to gather basic information even on unsupported systems. [lenb: updated warning message text] Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5165450a81877..7accc4a733667 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -326,6 +326,7 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; +unsigned int force_load; unsigned int has_aperf; unsigned int has_aperf_access; unsigned int has_epb; @@ -1058,7 +1059,8 @@ void probe_platform_features(unsigned int family, unsigned int model) if (authentic_amd || hygon_genuine) { - platform = &default_features; + /* fallback to default features on unsupported models */ + force_load++; if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -1067,7 +1069,7 @@ void probe_platform_features(unsigned int family, unsigned int model) if ((edx & (1 << 14)) && family >= 0x17) platform = &amd_features_with_rapl; } - return; + goto end; } if (!genuine_intel) @@ -1081,6 +1083,11 @@ void probe_platform_features(unsigned int family, unsigned int model) } end: + if (force_load && !platform) { + fprintf(outf, "Forced to run on unsupported platform!\n"); + platform = &default_features; + } + if (platform) return; @@ -2160,6 +2167,8 @@ void help(void) " displays the raw counter values\n" " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" + " -f, --force\n" + " force load turbostat with minimum default features on unsupported platforms.\n" " -H, --hide [column | column,column,...]\n" " hide the specified column(s)\n" " -i, --interval sec.subsec\n" @@ -9942,6 +9951,7 @@ void cmdline(int argc, char **argv) { "Dump", no_argument, 0, 'D' }, { "debug", no_argument, 0, 'd' }, /* internal, not documented */ { "enable", required_argument, 0, 'e' }, + { "force", no_argument, 0, 'f' }, { "interval", required_argument, 0, 'i' }, { "IPC", no_argument, 0, 'I' }, { "num_iterations", required_argument, 0, 'n' }, @@ -10002,6 +10012,9 @@ void cmdline(int argc, char **argv) /* --enable specified counter */ bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST); break; + case 'f': + force_load++; + break; case 'd': debug++; ENABLE_BIC(BIC_DISABLED_BY_DEFAULT); From 05c14d8fd71b9c19391d0b4d65b1c1764e1c440f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 10 Dec 2024 09:49:45 -0500 Subject: [PATCH 08/25] tools/power turbostat: add Busy% to "show idle" Suggested-by: Artem Bityutskiy Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7accc4a733667..7a10e51a13496 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -270,7 +270,7 @@ struct msr_counter bic[] = { #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_SysWatt | BIC_Sys_J) From 22a835282b6240f38097f479ae2194bbeb0181e4 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 17 Dec 2024 18:00:31 -0500 Subject: [PATCH 09/25] tools/power turbostat: Add an NMI column Add an NMI column, a proper sub-set of the IRQ column. It would be preferable if the kernel exported /sys/kernel/irq/NMI/per_cpu_count. But since we are already forced to parse /proc/interrupts, noticing which row is the NMI is simple enough. Suggested-by: Artem Bityutskiy Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 54 ++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a10e51a13496..2620ed000ad07 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -202,6 +202,7 @@ struct msr_counter bic[] = { { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SysWatt", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -266,12 +267,13 @@ struct msr_counter bic[] = { #define BIC_Diec6 (1ULL << 58) #define BIC_SysWatt (1ULL << 59) #define BIC_Sys_J (1ULL << 60) +#define BIC_NMI (1ULL << 61) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) #define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) -#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) +#define BIC_OTHER ( BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_SysWatt | BIC_Sys_J) @@ -1628,6 +1630,7 @@ struct thread_data { unsigned long long c1; unsigned long long instr_count; unsigned long long irq_count; + unsigned long long nmi_count; unsigned int smi_count; unsigned int cpu_id; unsigned int apic_id; @@ -1934,6 +1937,7 @@ struct timeval tv_even, tv_odd, tv_delta; int *irq_column_2_cpu; /* /proc/interrupts column numbers */ int *irqs_per_cpu; /* indexed by cpu_num */ +int *nmi_per_cpu; /* indexed by cpu_num */ void setup_all_buffers(bool startup); @@ -2319,6 +2323,12 @@ void print_header(char *delim) else outp += sprintf(outp, "%sIRQ", (printed++ ? delim : "")); } + if (DO_BIC(BIC_NMI)) { + if (sums_need_wide_columns) + outp += sprintf(outp, "%s NMI", (printed++ ? delim : "")); + else + outp += sprintf(outp, "%sNMI", (printed++ ? delim : "")); + } if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "%sSMI", (printed++ ? delim : "")); @@ -2605,6 +2615,8 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p if (DO_BIC(BIC_IRQ)) outp += sprintf(outp, "IRQ: %lld\n", t->irq_count); + if (DO_BIC(BIC_NMI)) + outp += sprintf(outp, "IRQ: %lld\n", t->nmi_count); if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "SMI: %d\n", t->smi_count); @@ -2824,6 +2836,14 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->irq_count); } + /* NMI */ + if (DO_BIC(BIC_NMI)) { + if (sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->nmi_count); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->nmi_count); + } + /* SMI */ if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count); @@ -3439,6 +3459,9 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d if (DO_BIC(BIC_IRQ)) old->irq_count = new->irq_count - old->irq_count; + if (DO_BIC(BIC_NMI)) + old->nmi_count = new->nmi_count - old->nmi_count; + if (DO_BIC(BIC_SMI)) old->smi_count = new->smi_count - old->smi_count; @@ -3519,6 +3542,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->instr_count = 0; t->irq_count = 0; + t->nmi_count = 0; t->smi_count = 0; c->c3 = 0; @@ -3623,6 +3647,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.instr_count += t->instr_count; average.threads.irq_count += t->irq_count; + average.threads.nmi_count += t->nmi_count; average.threads.smi_count += t->smi_count; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { @@ -3764,6 +3789,9 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.threads.irq_count > 9999999) sums_need_wide_columns = 1; + if (average.threads.nmi_count > 9999999) + sums_need_wide_columns = 1; + average.cores.c3 /= topo.allowed_cores; average.cores.c6 /= topo.allowed_cores; @@ -4620,6 +4648,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; + if (DO_BIC(BIC_NMI)) + t->nmi_count = nmi_per_cpu[cpu]; get_cstate_counters(cpu, t, c, p); @@ -5365,6 +5395,7 @@ void free_all_buffers(void) free(irq_column_2_cpu); free(irqs_per_cpu); + free(nmi_per_cpu); for (i = 0; i <= topo.max_cpu_num; ++i) { if (cpus[i].put_ids) @@ -5821,31 +5852,37 @@ int snapshot_proc_interrupts(void) irq_column_2_cpu[column] = cpu_number; irqs_per_cpu[cpu_number] = 0; + nmi_per_cpu[cpu_number] = 0; } /* read /proc/interrupt count lines and sum up irqs per cpu */ while (1) { int column; char buf[64]; + int this_row_is_nmi = 0; - retval = fscanf(fp, " %s:", buf); /* flush irq# "N:" */ + retval = fscanf(fp, " %s:", buf); /* irq# "N:" */ if (retval != 1) break; + if (strncmp(buf, "NMI", strlen("NMI")) == 0) + this_row_is_nmi = 1; + /* read the count per cpu */ for (column = 0; column < topo.num_cpus; ++column) { int cpu_number, irq_count; retval = fscanf(fp, " %d", &irq_count); + if (retval != 1) break; cpu_number = irq_column_2_cpu[column]; irqs_per_cpu[cpu_number] += irq_count; - + if (this_row_is_nmi) + nmi_per_cpu[cpu_number] += irq_count; } - while (getc(fp) != '\n') ; /* flush interrupt description */ } @@ -5942,7 +5979,7 @@ int snapshot_sys_lpi_us(void) */ int snapshot_proc_sysfs_files(void) { - if (DO_BIC(BIC_IRQ)) + if (DO_BIC(BIC_IRQ) || DO_BIC(BIC_NMI)) if (snapshot_proc_interrupts()) return 1; @@ -8263,6 +8300,7 @@ void process_cpuid() aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1; BIC_PRESENT(BIC_IRQ); + BIC_PRESENT(BIC_NMI); BIC_PRESENT(BIC_TSC_MHz); } @@ -8613,7 +8651,11 @@ void allocate_irq_buffers(void) irqs_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int)); if (irqs_per_cpu == NULL) - err(-1, "calloc %d", topo.max_cpu_num + 1); + err(-1, "calloc %d IRQ", topo.max_cpu_num + 1); + + nmi_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (nmi_per_cpu == NULL) + err(-1, "calloc %d NMI", topo.max_cpu_num + 1); } int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p) From 4a358ba215dfefe161b5904e51e48f5f0e82652f Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 18 Dec 2024 11:43:32 +0100 Subject: [PATCH 10/25] tools/power turbostat: Remove SysWatt from DISABLED_BY_DEFAULT The counter is present on most supporting Intel platforms and provides useful data to the user. There is no reason to disable the counter by default. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- tools/power/x86/turbostat/turbostat.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 59b89e6b25bf0..f043a93defd4a 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -190,7 +190,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. .PP -\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. Enable with --enable SysWatt. +\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). .PP \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. Note that the system summary is the sum of the package throttling time, and thus may be higher than 100% on a multi-package system. Note that the meaning of this field is model specific. For example, some hardware increments this counter when RAPL responds to thermal limits, but does not increment this counter when RAPL responds to power limits. Comparing PkgWatt and PkgTmp to system limits is necessary. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2620ed000ad07..1d99aaf9681b0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -270,12 +270,12 @@ struct msr_counter bic[] = { #define BIC_NMI (1ULL << 61) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) -#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) +#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) #define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_SysWatt | BIC_Sys_J) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; From 2f60f03934a50bc1fb69bb4f47a25cddd6807b0b Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 20 Dec 2024 13:38:34 +0100 Subject: [PATCH 11/25] tools/power turbostat: Fix PMT mmaped file size rounding This (the old code) is just not how you round up to a page size. Noticed on a recent Intel platform. Previous ones must have been reporting sizes already aligned to a page and so the bug was missed when testing. Fixes: f0e4ed752fda ("tools/power turbostat: Add early support for PMT counters") Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1d99aaf9681b0..a2ca1c6c3638a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -95,6 +95,8 @@ #define INTEL_ECORE_TYPE 0x20 #define INTEL_PCORE_TYPE 0x40 +#define ROUND_UP_TO_PAGE_SIZE(n) (((n) + 0x1000UL-1UL) & ~(0x1000UL-1UL)) + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; @@ -8996,7 +8998,7 @@ struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) if (fd_pmt == -1) goto loop_cleanup_and_break; - mmap_size = (size + 0x1000UL) & (~0x1000UL); + mmap_size = ROUND_UP_TO_PAGE_SIZE(size); mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0); if (mmio != MAP_FAILED) { From debe797c1e972ebe434c90f3fa7f54d9cf7ab251 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 5 Dec 2024 19:26:14 +0100 Subject: [PATCH 12/25] tools/power turbostat: Add fixed RAPL PSYS divisor for SPR Intel Sapphire Rapids is an exception and has fixed divisor for RAPL PSYS counter set to 1.0. Add a platform bit and enable it for SPR. Reported-by: Zhang Rui Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a2ca1c6c3638a..1bcecfed721b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -358,7 +358,7 @@ unsigned long long cpuidle_cur_sys_lpi_us; unsigned int tj_max; unsigned int tj_max_override; double rapl_power_units, rapl_time_units; -double rapl_dram_energy_units, rapl_energy_units; +double rapl_dram_energy_units, rapl_energy_units, rapl_psys_energy_units; double rapl_joule_counter_range; unsigned int crystal_hz; unsigned long long tsc_hz; @@ -424,6 +424,7 @@ struct platform_features { bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ + bool has_fixed_rapl_psys_unit; /* Fixed Energy Unit used for PSYS RAPL Domain */ int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ @@ -824,6 +825,7 @@ static const struct platform_features spr_features = { .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, + .has_fixed_rapl_psys_unit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; @@ -1292,7 +1294,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr = MSR_PLATFORM_ENERGY_STATUS, .msr_mask = 0x00000000FFFFFFFF, .msr_shift = 0, - .platform_rapl_msr_scale = &rapl_energy_units, + .platform_rapl_msr_scale = &rapl_psys_energy_units, .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM, .bic = BIC_SysWatt | BIC_Sys_J, .compat_scale = 1.0, @@ -7112,6 +7114,11 @@ void rapl_probe_intel(void) else rapl_dram_energy_units = rapl_energy_units; + if (platform->has_fixed_rapl_psys_unit) + rapl_psys_energy_units = 1.0; + else + rapl_psys_energy_units = rapl_energy_units; + time_unit = msr >> 16 & 0xF; if (time_unit == 0) time_unit = 0xA; From 1af5baeda512d0940748fdf9b559e1041dbab0cf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 8 Jan 2025 14:19:42 +0800 Subject: [PATCH 13/25] tools/power turbostat: Enhance turbostat self-performance visibility Include procfs and sysfs data collection time in the system summary row of the "usec" column. This is useful for isolating where the time goes during turbostat data collection. Background: Column "usec" shows 1. the number of microseconds elapsed during counter collection, including thread migration -- if any, for each CPU row. 2. total elapsed time to collect the counters on all cpus, for the summary row. This can be used to check the time cost of a give column. For example, run below commands separately turbostat --show usec sleep 1 turbostat --show usec,CoreTmp sleep 1 and the delta in the usec column will tell the time cost for CoreTmp (Thermal MSR read) Problem: Some of the kernel procfs/sysfs accesses are expensive, especially on high core count systems. "usec" column cannot tell this because it only includes the time cost of the counters. Solution: Leave the per CPU "usec" as it is and modify the summary "usec" to include the time cost of the procfs/sysfs snapshot. With it, the "usec" column can be used to get 1. the baseline, e.g. turbostat --show usec sleep 1 2. the baseline + some per CPU counter cost, e.g. turbostat --show usec,CoreTmp sleep 1 3. the baseline + some per CPU sysfs cost, e.g. turbostat --show usec,C1 sleep 1 4. the baseline + /proc/interrupts cost, e.g turbostat --show usec,IRQ sleep 1 Man-page update is also included. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- tools/power/x86/turbostat/turbostat.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index f043a93defd4a..99bf905ade812 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -136,7 +136,7 @@ displays the statistics gathered since it was forked. The system configuration dump (if --quiet is not used) is followed by statistics. The first row of the statistics labels the content of each column (below). The second row of statistics is the system summary line. The system summary line has a '-' in the columns for the Package, Core, and CPU. The contents of the system summary line depends on the type of column. Columns that count items (eg. IRQ) show the sum across all CPUs in the system. Columns that show a percentage show the average across all CPUs in the system. Columns that dump raw MSR values simply show 0 in the summary. After the system summary row, each row describes a specific Package/Core/CPU. Note that if the --cpu parameter is used to limit which specific CPUs are displayed, turbostat will still collect statistics for all CPUs in the system and will still show the system summary for all CPUs in the system. .SH COLUMN DESCRIPTIONS .PP -\fBusec\fP For each CPU, the number of microseconds elapsed during counter collection, including thread migration -- if any. This counter is disabled by default, and is enabled with "--enable usec", or --debug. On the summary row, usec refers to the total elapsed time to collect the counters on all cpus. +\fBusec\fP For each CPU, the number of microseconds elapsed during counter collection, including thread migration -- if any. This counter is disabled by default, and is enabled with "--enable usec", or --debug. On the summary row, usec refers to the total elapsed time to snapshot the procfs/sysfs and collect the counters on all cpus. .PP \fBTime_Of_Day_Seconds\fP For each CPU, the gettimeofday(2) value (seconds.subsec since Epoch) when the counters ending the measurement interval were collected. This column is disabled by default, and can be enabled with "--enable Time_Of_Day_Seconds" or "--debug". On the summary row, Time_Of_Day_Seconds refers to the timestamp following collection of counters on the last CPU. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1bcecfed721b2..adcf5f0a06334 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -370,6 +370,9 @@ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; + +static struct timeval procsysfs_tv_begin; + int ignore_stdin; bool no_msr; bool no_perf; @@ -3638,7 +3641,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) /* remember first tv_begin */ if (average.threads.tv_begin.tv_sec == 0) - average.threads.tv_begin = t->tv_begin; + average.threads.tv_begin = procsysfs_tv_begin; /* remember last tv_end */ average.threads.tv_end = t->tv_end; @@ -5983,6 +5986,8 @@ int snapshot_sys_lpi_us(void) */ int snapshot_proc_sysfs_files(void) { + gettimeofday(&procsysfs_tv_begin, (struct timezone *)NULL); + if (DO_BIC(BIC_IRQ) || DO_BIC(BIC_NMI)) if (snapshot_proc_interrupts()) return 1; From 7c6fee25bdf5c8f8a1bcc6fa3566fffb7fe9eb9a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 14 Jan 2025 16:11:28 +0100 Subject: [PATCH 14/25] tools/power turbostat: Check for non-zero value when MSR probing For some MSRs, for example, the Platform Energy Counter (RAPL PSYS), it is required to additionally check for a non-zero value to confirm that it is present. From Intel SDM vol. 4: Platform Energy Counter (R/O) This MSR is valid only if both platform vendor hardware implementation and BIOS enablement support it. This MSR will read 0 if not valid. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index adcf5f0a06334..6b72b922e2f5d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2113,13 +2113,17 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) int probe_msr(int cpu, off_t offset) { ssize_t retval; - unsigned long long dummy; + unsigned long long value; assert(!no_msr); - retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset); + retval = pread(get_msr_fd(cpu), &value, sizeof(value), offset); - if (retval != sizeof(dummy)) + /* + * Expect MSRs to accumulate some non-zero value since the system was powered on. + * Treat zero as a read failure. + */ + if (retval != sizeof(value) || value == 0) return 1; return 0; From 34537ddd208d614dbefeb97823ae1c79e7771588 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 10 Dec 2024 18:27:38 +0100 Subject: [PATCH 15/25] tools/power turbostat: Return default value for unmapped PMT domains When requesting PMT counters with --add command, user may want to skip specifying values for all the domains (that is, cpu, core, package etc). For the domains that user did not provide information on how to read the counter, return default value - zero. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6b72b922e2f5d..60b1ade8659b5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4615,7 +4615,8 @@ unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id) { - assert(domain_id < ppmt->num_domains); + if (domain_id >= ppmt->num_domains) + return 0; const unsigned long *pmmio = ppmt->domains[domain_id].pcounter; const unsigned long value = pmmio ? *pmmio : 0; From 089134cb0502ba962bce9402ce96e0875876d401 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 10 Dec 2024 11:41:58 +0100 Subject: [PATCH 16/25] tools/power turbostat: Extend PMT identification with a sequence number When platforms expose multiple PMT aggregators with the same GUID, the only way to identify them and map to specific domain is by reading them in an order they were exposed via PCIe. Intel PMT kernel driver does keep the same order and numbers the telemetry directories accordingly. Use GUID and sequence number (order) to uniquely identify PMT aggregators. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 60b1ade8659b5..14c4958867463 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1536,6 +1536,7 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { #define PMT_COUNTER_MTL_DC6_LSB 0 #define PMT_COUNTER_MTL_DC6_MSB 63 #define PMT_MTL_DC6_GUID 0x1a067102 +#define PMT_MTL_DC6_SEQ 0 #define PMT_COUNTER_NAME_SIZE_BYTES 16 #define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 @@ -9083,7 +9084,7 @@ void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offs return ret; } -struct pmt_mmio *pmt_add_guid(unsigned int guid) +struct pmt_mmio *pmt_add_guid(unsigned int guid, unsigned int seq) { struct pmt_mmio *ret; @@ -9091,6 +9092,11 @@ struct pmt_mmio *pmt_add_guid(unsigned int guid) if (!ret) ret = pmt_mmio_open(guid); + while (ret && seq) { + ret = ret->next; + --seq; + } + return ret; } @@ -9137,7 +9143,7 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, pcounter->domains[domain_id].pcounter = pmmio; } -int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, +int pmt_add_counter(unsigned int guid, unsigned int seq, const char *name, enum pmt_datatype type, unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) { @@ -9157,10 +9163,10 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, exit(1); } - mmio = pmt_add_guid(guid); + mmio = pmt_add_guid(guid, seq); if (!mmio) { if (mode != PMT_OPEN_TRY) { - fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid); + fprintf(stderr, "%s: failed to map PMT MMIO for guid %x, seq %u\n", __func__, guid, seq); exit(1); } @@ -9216,9 +9222,9 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, void pmt_init(void) { if (BIC_IS_ENABLED(BIC_Diec6)) { - pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB, - PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, - 0, PMT_OPEN_TRY); + pmt_add_counter(PMT_MTL_DC6_GUID, PMT_MTL_DC6_SEQ, "Die%c6", PMT_TYPE_XTAL_TIME, + PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, + SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY); } } @@ -9699,6 +9705,7 @@ void parse_add_command_pmt(char *add_command) unsigned int lsb; unsigned int msb; unsigned int guid; + unsigned int seq = 0; /* By default, pick first file in a sequence with a given GUID. */ unsigned int domain_id; enum counter_scope scope = 0; enum pmt_datatype type = PMT_TYPE_RAW; @@ -9778,6 +9785,10 @@ void parse_add_command_pmt(char *add_command) goto next; } + if (sscanf(add_command, "seq=%x", &seq) == 1) { + goto next; + } + next: add_command = strchr(add_command, ','); if (add_command) { @@ -9864,7 +9875,7 @@ void parse_add_command_pmt(char *add_command) exit(1); } - pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED); + pmt_add_counter(guid, seq, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED); } void parse_add_command(char *add_command) From 4265a86582eaa224d171328be0c71e2a7ccd194f Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 12 Dec 2024 18:59:25 +0100 Subject: [PATCH 17/25] tools/power turbostat: Add PMT directory iterator helper PMT directories exposed in sysfs use the following pattern: telem%u for example: telem0, telem2, telem3, ..., telem15, telem16 This naming scheme preserves the ordering from the PCIe discovery, which is important to correctly map the telemetry directory to the specific domain (cpu, core, package etc). Because readdir() traverses the entries in alphabetical order, causing for example "telem13" to be traversed before "telem3", it is necessary to use scandir() with custom compare() callback to preserve the PCIe ordering. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 87 +++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 14c4958867463..6104d5bcca5c1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1589,6 +1589,93 @@ struct pmt_counter { struct pmt_domain_info *domains; }; +/* + * PMT telemetry directory iterator. + * Used to iterate telemetry files in sysfs in correct order. + */ +struct pmt_diriter_t +{ + DIR *dir; + struct dirent **namelist; + unsigned int num_names; + unsigned int current_name_idx; +}; + +int pmt_telemdir_filter(const struct dirent *e) +{ + unsigned int dummy; + return sscanf(e->d_name, "telem%u", &dummy); +} + +int pmt_telemdir_sort(const struct dirent **a, const struct dirent **b) +{ + unsigned int aidx = 0, bidx = 0; + + sscanf((*a)->d_name, "telem%u", &aidx); + sscanf((*b)->d_name, "telem%u", &bidx); + + return aidx >= bidx; +} + +const struct dirent* pmt_diriter_next(struct pmt_diriter_t *iter) +{ + const struct dirent *ret = NULL; + + if (!iter->dir) + return NULL; + + if (iter->current_name_idx >= iter->num_names) + return NULL; + + ret = iter->namelist[iter->current_name_idx]; + ++iter->current_name_idx; + + return ret; +} + +const struct dirent* pmt_diriter_begin(struct pmt_diriter_t *iter, const char *pmt_root_path) +{ + int num_names = iter->num_names; + + if (!iter->dir) { + iter->dir = opendir(pmt_root_path); + if (iter->dir == NULL) + return NULL; + + num_names = scandir(pmt_root_path, &iter->namelist, pmt_telemdir_filter, pmt_telemdir_sort); + if (num_names == -1) + return NULL; + } + + iter->current_name_idx = 0; + iter->num_names = num_names; + + return pmt_diriter_next(iter); +} + +void pmt_diriter_init(struct pmt_diriter_t *iter) +{ + memset(iter, 0, sizeof(*iter)); +} + +void pmt_diriter_remove(struct pmt_diriter_t *iter) +{ + if (iter->namelist) { + for (unsigned int i = 0; i < iter->num_names; i++) { + free(iter->namelist[i]); + iter->namelist[i] = NULL; + } + } + + free(iter->namelist); + iter->namelist = NULL; + iter->num_names = 0; + iter->current_name_idx = 0; + + closedir(iter->dir); + iter->dir = NULL; +} + unsigned int pmt_counter_get_width(const struct pmt_counter *p) { return (p->msb - p->lsb) + 1; From 16ce467875ef8572b82f9af30fcf7b2f65fc2e95 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 6 Dec 2024 11:22:00 +0100 Subject: [PATCH 18/25] tools/power turbostat: Allow mapping multiple PMT files with the same GUID Some platforms may expose multiple telemetry files identified with the same GUID. Interpreting it correctly, to associate given counter with a CPU, core or a package requires more metadata from the user. Parse and create ordered, linked list of those PMT aggregators, so that we can identify specific aggregator with GUID + sequence number. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 75 ++++++++++++++------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6104d5bcca5c1..f76e1de3f9687 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9033,46 +9033,35 @@ int parse_telem_info_file(int fd_dir, const char *info_filename, const char *for struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) { - DIR *dirp; - struct dirent *entry; + struct pmt_diriter_t pmt_iter; + const struct dirent *entry; struct stat st; - unsigned int telem_idx; int fd_telem_dir, fd_pmt; unsigned long guid, size, offset; size_t mmap_size; void *mmio; - struct pmt_mmio *ret = NULL; + struct pmt_mmio *head = NULL, *last = NULL; + struct pmt_mmio *new_pmt = NULL; if (stat(SYSFS_TELEM_PATH, &st) == -1) return NULL; - dirp = opendir(SYSFS_TELEM_PATH); - if (dirp == NULL) + pmt_diriter_init(&pmt_iter); + entry = pmt_diriter_begin(&pmt_iter, SYSFS_TELEM_PATH); + if (!entry) { + pmt_diriter_remove(&pmt_iter); return NULL; + } - for (;;) { - entry = readdir(dirp); - - if (entry == NULL) - break; - - if (strcmp(entry->d_name, ".") == 0) - continue; - - if (strcmp(entry->d_name, "..") == 0) - continue; - - if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1) - continue; - - if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) { + for (;entry != NULL; entry = pmt_diriter_next(&pmt_iter)) { + if (fstatat(dirfd(pmt_iter.dir), entry->d_name, &st, 0) == -1) { break; } if (!S_ISDIR(st.st_mode)) continue; - fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY); + fd_telem_dir = openat(dirfd(pmt_iter.dir), entry->d_name, O_RDONLY); if (fd_telem_dir == -1) { break; } @@ -9106,35 +9095,51 @@ struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) mmap_size = ROUND_UP_TO_PAGE_SIZE(size); mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0); if (mmio != MAP_FAILED) { - if (debug) fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio); - ret = calloc(1, sizeof(*ret)); + new_pmt = calloc(1, sizeof(*new_pmt)); - if (!ret) { + if (!new_pmt) { fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__); exit(1); } - ret->guid = guid; - ret->mmio_base = mmio; - ret->pmt_offset = offset; - ret->size = size; + /* + * Create linked list of mmaped regions, + * but preserve the ordering from sysfs. + * Ordering is important for the user to + * use the seq=%u parameter when adding a counter. + */ + new_pmt->guid = guid; + new_pmt->mmio_base = mmio; + new_pmt->pmt_offset = offset; + new_pmt->size = size; + new_pmt->next = pmt_mmios; + + if (last) + last->next = new_pmt; + else + head = new_pmt; - ret->next = pmt_mmios; - pmt_mmios = ret; + last = new_pmt; } loop_cleanup_and_break: close(fd_pmt); close(fd_telem_dir); - break; } - closedir(dirp); + pmt_diriter_remove(&pmt_iter); - return ret; + /* + * If we found something, stick just + * created linked list to the front. + */ + if (head) + pmt_mmios = head; + + return head; } struct pmt_mmio *pmt_mmio_find(unsigned int guid) From 83fbeb9f9776cd044d36af606127f56206337bab Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 12 Dec 2024 19:11:34 +0100 Subject: [PATCH 19/25] tools/power turbostat: Allow adding PMT counters directly by sysfs path Allow user to add PMT counters by either identifying the source with: guid=%u,seq=%u or, since this patch, with direct sysfs path: path=%s, for example path=/sys/class/intel_pmt/telem5 In the later case, the guid and sequence number will be infered by turbostat. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 107 +++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f76e1de3f9687..0f2475fa9fa4e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9788,11 +9788,96 @@ bool starts_with(const char *str, const char *prefix) return strncmp(prefix, str, strlen(prefix)) == 0; } +int pmt_parse_from_path(const char *target_path, unsigned int *out_guid, unsigned int *out_seq) +{ + struct pmt_diriter_t pmt_iter; + const struct dirent *dirname; + struct stat stat, target_stat; + int fd_telem_dir = -1; + int fd_target_dir; + unsigned int seq = 0; + unsigned long guid, target_guid; + int ret = -1; + + fd_target_dir = open(target_path, O_RDONLY | O_DIRECTORY); + if (fd_target_dir == -1) { + return -1; + } + + if (fstat(fd_target_dir, &target_stat) == -1) { + fprintf(stderr, "%s: Failed to stat the target: %s", __func__, strerror(errno)); + exit(1); + } + + if (parse_telem_info_file(fd_target_dir, "guid", "%lx", &target_guid)) { + fprintf(stderr, "%s: Failed to parse the target guid file: %s", __func__, strerror(errno)); + exit(1); + } + + close(fd_target_dir); + + pmt_diriter_init(&pmt_iter); + + for (dirname = pmt_diriter_begin(&pmt_iter, SYSFS_TELEM_PATH); dirname != NULL; + dirname = pmt_diriter_next(&pmt_iter)) { + + fd_telem_dir = openat(dirfd(pmt_iter.dir), dirname->d_name, O_RDONLY | O_DIRECTORY); + if (fd_telem_dir == -1) { + continue; + } + + if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { + fprintf(stderr, "%s: Failed to parse the guid file: %s", __func__, strerror(errno)); + continue; + } + + if (fstat(fd_telem_dir, &stat) == -1) { + fprintf(stderr, "%s: Failed to stat %s directory: %s", __func__, + dirname->d_name, strerror(errno)); + continue; + } + + /* + * If reached the same directory as target, exit the loop. + * Seq has the correct value now. + */ + if (stat.st_dev == target_stat.st_dev && stat.st_ino == target_stat.st_ino) { + ret = 0; + break; + } + + /* + * If reached directory with the same guid, + * but it's not the target directory yet, + * increment seq and continue the search. + */ + if (guid == target_guid) + ++seq; + + close(fd_telem_dir); + fd_telem_dir = -1; + } + + pmt_diriter_remove(&pmt_iter); + + if (fd_telem_dir != -1) + close(fd_telem_dir); + + if (!ret) { + *out_guid = target_guid; + *out_seq = seq; + } + + return ret; +} + void parse_add_command_pmt(char *add_command) { char *name = NULL; char *type_name = NULL; char *format_name = NULL; + char *direct_path = NULL; + static const char direct_path_prefix[] = "path="; unsigned int offset; unsigned int lsb; unsigned int msb; @@ -9881,6 +9966,10 @@ void parse_add_command_pmt(char *add_command) goto next; } + if (strncmp(add_command, direct_path_prefix, strlen(direct_path_prefix)) == 0) { + direct_path = add_command + strlen(direct_path_prefix); + goto next; + } next: add_command = strchr(add_command, ','); if (add_command) { @@ -9952,8 +10041,24 @@ void parse_add_command_pmt(char *add_command) exit(1); } + if (direct_path && has_guid) { + printf("%s: path and guid+seq parameters are mutually exclusive\n" + "notice: passed guid=0x%x and path=%s\n", __func__, guid, direct_path); + exit(1); + } + + if (direct_path) { + if (pmt_parse_from_path(direct_path, &guid, &seq)) { + printf("%s: failed to parse PMT file from %s\n", __func__, direct_path); + exit(1); + } + + /* GUID was just infered from the direct path. */ + has_guid = true; + } + if (!has_guid) { - printf("%s: missing %s\n", __func__, "guid"); + printf("%s: missing %s\n", __func__, "guid or path"); exit(1); } From a80e53472209b1c749e02e91ac62c053ac457099 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 3 Dec 2024 16:11:21 -0500 Subject: [PATCH 20/25] tools/power turbostat: version 2025.01.14 Fix checkpatch whitespace issues since 2024.11.30 Summary of Changes since 2024.11.30: Enable SysWatt by default. Add initial PTL, CWF platform support. Refuse to run on unsupported platforms without --force to avoid not-so-useful measurements mistakenly made using obsolete versions. Harden initial PMT code in response to early use. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 36 ++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0f2475fa9fa4e..76d2632e60ac6 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2024 Intel Corporation. + * Copyright (c) 2025 Intel Corporation. * Len Brown */ @@ -271,11 +271,11 @@ struct msr_counter bic[] = { #define BIC_Sys_J (1ULL << 60) #define BIC_NMI (1ULL << 61) -#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) -#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) +#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) +#define BIC_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) #define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) -#define BIC_OTHER ( BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) +#define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -1593,8 +1593,7 @@ struct pmt_counter { * PMT telemetry directory iterator. * Used to iterate telemetry files in sysfs in correct order. */ -struct pmt_diriter_t -{ +struct pmt_diriter_t { DIR *dir; struct dirent **namelist; unsigned int num_names; @@ -1604,6 +1603,7 @@ struct pmt_diriter_t int pmt_telemdir_filter(const struct dirent *e) { unsigned int dummy; + return sscanf(e->d_name, "telem%u", &dummy); } @@ -1617,7 +1617,7 @@ int pmt_telemdir_sort(const struct dirent **a, const struct dirent **b) return aidx >= bidx; } -const struct dirent* pmt_diriter_next(struct pmt_diriter_t *iter) +const struct dirent *pmt_diriter_next(struct pmt_diriter_t *iter) { const struct dirent *ret = NULL; @@ -1633,7 +1633,7 @@ const struct dirent* pmt_diriter_next(struct pmt_diriter_t *iter) return ret; } -const struct dirent* pmt_diriter_begin(struct pmt_diriter_t *iter, const char *pmt_root_path) +const struct dirent *pmt_diriter_begin(struct pmt_diriter_t *iter, const char *pmt_root_path) { int num_names = iter->num_names; @@ -2302,7 +2302,7 @@ void help(void) " -h, --help\n" " print this help message\n" " -v, --version\n" - " print version information\n" "\n" "For more help, run \"man turbostat\"\n"); + " print version information\n\nFor more help, run \"man turbostat\"\n"); } /* @@ -9053,18 +9053,16 @@ struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) return NULL; } - for (;entry != NULL; entry = pmt_diriter_next(&pmt_iter)) { - if (fstatat(dirfd(pmt_iter.dir), entry->d_name, &st, 0) == -1) { + for ( ; entry != NULL; entry = pmt_diriter_next(&pmt_iter)) { + if (fstatat(dirfd(pmt_iter.dir), entry->d_name, &st, 0) == -1) break; - } if (!S_ISDIR(st.st_mode)) continue; fd_telem_dir = openat(dirfd(pmt_iter.dir), entry->d_name, O_RDONLY); - if (fd_telem_dir == -1) { + if (fd_telem_dir == -1) break; - } if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { close(fd_telem_dir); @@ -9425,7 +9423,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.11.30 - Len Brown \n"); + fprintf(outf, "turbostat version 2025.01.14 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 @@ -9750,7 +9748,7 @@ void parse_add_command_msr(char *add_command) } if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) { - fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event ) required\n"); + fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event) required\n"); fail++; } @@ -9822,9 +9820,8 @@ int pmt_parse_from_path(const char *target_path, unsigned int *out_guid, unsigne dirname = pmt_diriter_next(&pmt_iter)) { fd_telem_dir = openat(dirfd(pmt_iter.dir), dirname->d_name, O_RDONLY | O_DIRECTORY); - if (fd_telem_dir == -1) { + if (fd_telem_dir == -1) continue; - } if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { fprintf(stderr, "%s: Failed to parse the guid file: %s", __func__, strerror(errno)); @@ -9962,9 +9959,8 @@ void parse_add_command_pmt(char *add_command) goto next; } - if (sscanf(add_command, "seq=%x", &seq) == 1) { + if (sscanf(add_command, "seq=%x", &seq) == 1) goto next; - } if (strncmp(add_command, direct_path_prefix, strlen(direct_path_prefix)) == 0) { direct_path = add_command + strlen(direct_path_prefix); From 1a202afeaa370970413846c2cb09b383875e753c Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 17 Jan 2025 13:36:59 +0100 Subject: [PATCH 21/25] tools/power turbostat: Add tcore clock PMT type Some PMT counters, for example module c1e residency on Intel Clearwater Forest, are reported using tcore clock type. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 32 ++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 76d2632e60ac6..ecaa4e0fb2c0d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1538,6 +1538,8 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { #define PMT_MTL_DC6_GUID 0x1a067102 #define PMT_MTL_DC6_SEQ 0 +unsigned long long tcore_clock_freq_hz = 800000000; + #define PMT_COUNTER_NAME_SIZE_BYTES 16 #define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 @@ -1560,6 +1562,7 @@ struct pmt_mmio { enum pmt_datatype { PMT_TYPE_RAW, PMT_TYPE_XTAL_TIME, + PMT_TYPE_TCORE_CLOCK, }; struct pmt_domain_info { @@ -2474,6 +2477,7 @@ void print_header(char *delim) break; case PMT_TYPE_XTAL_TIME: + case PMT_TYPE_TCORE_CLOCK: outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name); break; } @@ -2548,6 +2552,7 @@ void print_header(char *delim) break; case PMT_TYPE_XTAL_TIME: + case PMT_TYPE_TCORE_CLOCK: outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name); break; } @@ -2679,6 +2684,7 @@ void print_header(char *delim) break; case PMT_TYPE_XTAL_TIME: + case PMT_TYPE_TCORE_CLOCK: outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name); break; } @@ -2997,7 +3003,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { const unsigned long value_raw = t->pmt_counter[i]; - const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + double value_converted; switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) @@ -3009,8 +3015,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data break; case PMT_TYPE_XTAL_TIME: + value_converted = 100.0 * value_raw / crystal_hz / interval_float; outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); break; + + case PMT_TYPE_TCORE_CLOCK: + value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float; + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); } } @@ -3077,7 +3088,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { const unsigned long value_raw = c->pmt_counter[i]; - const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + double value_converted; switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) @@ -3089,8 +3100,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data break; case PMT_TYPE_XTAL_TIME: + value_converted = 100.0 * value_raw / crystal_hz / interval_float; outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); break; + + case PMT_TYPE_TCORE_CLOCK: + value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float; + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); } } @@ -3275,7 +3291,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { const unsigned long value_raw = p->pmt_counter[i]; - const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + double value_converted; switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) @@ -3287,8 +3303,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data break; case PMT_TYPE_XTAL_TIME: + value_converted = 100.0 * value_raw / crystal_hz / interval_float; outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); break; + + case PMT_TYPE_TCORE_CLOCK: + value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float; + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); } } @@ -10016,6 +10037,11 @@ void parse_add_command_pmt(char *add_command) has_type = true; } + if (strcmp("tcore_clock", type_name) == 0) { + type = PMT_TYPE_TCORE_CLOCK; + has_type = true; + } + if (!has_type) { printf("%s: invalid %s: %s\n", __func__, "type", type_name); exit(1); From b32c36975da48afc9089f8b61f7b2dcc40e479d2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 27 Jan 2025 16:42:19 -0600 Subject: [PATCH 22/25] tools/power turbostat: Fix forked child affinity regression In "one-shot" mode, turbostat 1. takes a counter snapshot 2. forks and waits for a child 3. takes the end counter snapshot and prints the result. But turbostat counter snapshots currently use affinity to travel around the system so that counter reads are "local", and this affinity must be cleared between #1 and #2 above. The offending commit removed that reset that allowed the child to run on cpu_present_set. Fix that issue, and improve upon the original by using cpu_possible_set for the child. This allows the child to also run on CPUs that hotplug online during its runtime. Reported-by: Zhang Rui Fixes: 7bb3fe27ad4f ("tools/power/turbostat: Obey allowed CPUs during startup") Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 54 ++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ecaa4e0fb2c0d..1f188a0908da6 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1120,8 +1120,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 #define MAX_ADDED_PACKAGE_COUNTERS 16 @@ -8488,6 +8488,33 @@ int dir_filter(const struct dirent *dirp) return 0; } +char *possible_file = "/sys/devices/system/cpu/possible"; +char possible_buf[1024]; + +int initialize_cpu_possible_set(void) +{ + FILE *fp; + + fp = fopen(possible_file, "r"); + if (!fp) { + warn("open %s", possible_file); + return -1; + } + if (fread(possible_buf, sizeof(char), 1024, fp) == 0) { + warn("read %s", possible_file); + goto err; + } + if (parse_cpu_str(possible_buf, cpu_possible_set, cpu_possible_setsize)) { + warnx("%s: cpu str malformat %s\n", possible_file, cpu_effective_str); + goto err; + } + return 0; + +err: + fclose(fp); + return -1; +} + void topology_probe(bool startup) { int i; @@ -8519,6 +8546,16 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_possible_set + */ + cpu_possible_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_possible_set == NULL) + err(3, "CPU_ALLOC"); + cpu_possible_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_possible_setsize, cpu_possible_set); + initialize_cpu_possible_set(); + /* * Allocate and initialize cpu_effective_set */ @@ -9371,6 +9408,18 @@ void turbostat_init() } } +void affinitize_child(void) +{ + /* Prefer cpu_possible_set, if available */ + if (sched_setaffinity(0, cpu_possible_setsize, cpu_possible_set)) { + warn("sched_setaffinity cpu_possible_set"); + + /* Otherwise, allow child to run on same cpu set as turbostat */ + if (sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set)) + warn("sched_setaffinity cpu_allowed_set"); + } +} + int fork_it(char **argv) { pid_t child_pid; @@ -9386,6 +9435,7 @@ int fork_it(char **argv) child_pid = fork(); if (!child_pid) { /* child */ + affinitize_child(); execvp(argv[0], argv); err(errno, "exec %s", argv[0]); } else { From 5499b5ac0b2c661cc37190a23a4aee9308b3d3ee Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 27 Jan 2025 20:58:42 -0600 Subject: [PATCH 23/25] tools/power turbostat: Harden one-shot mode against cpu offline when turbostat interval mode can't migrate to a CPU, it complains, prints no data, re-initializes with the new CPU configuration and starts a new interval. But this strategy in the face of a CPU hotplug offline during an interval doesn't help in one-shot mode. When the missing CPU is discovered at the end of the interval, the forked program has already returned and there is nothing left for a new interval to measure. So instead of aborting get_coutners() and delta_cpu() if a missing CPU is detected, complain, but carry on and output what statistics are actually present. Use the same strategy for delta_cpu when aperf:mperf are observed to have been reset -- complain, but carry on and print data for the CPUs that are still present. Interval mode error handling is unchanged. One-shot mode can now do this: $ sudo chcpu -e 1 ; sudo ./turbostat --quiet --show PkgWatt,Busy%,CPU chcpu -d 1 CPU 1 enabled CPU 1 disabled get_counters: Could not migrate to CPU 1 ./turbostat: Counter reset detected 0.036920 sec CPU Busy% PkgWatt - 0.00 10.00 0 99.73 10.00 1 0.00 2 91.53 3 16.83 Suggested-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1f188a0908da6..8df08819e7b41 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2063,6 +2063,8 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk { int retval, pkg_no, core_no, thread_no, node_no; + retval = 0; + for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) { for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { @@ -2078,14 +2080,12 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk c = GET_CORE(core_base, core_no, node_no, pkg_no); p = GET_PKG(pkg_base, pkg_no); - retval = func(t, c, p); - if (retval) - return retval; + retval |= func(t, c, p); } } } } - return 0; + return retval; } int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -3620,12 +3620,10 @@ int delta_cpu(struct thread_data *t, struct core_data *c, /* always calculate thread delta */ retval = delta_thread(t, t2, c2); /* c2 is core delta */ - if (retval) - return retval; /* calculate package delta only for 1st core in package */ if (is_cpu_first_core_in_package(t, c, p)) - retval = delta_package(p, p2); + retval |= delta_package(p, p2); return retval; } @@ -5748,6 +5746,8 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, { int retval, pkg_no, node_no, core_no, thread_no; + retval = 0; + for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) { for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { @@ -5769,14 +5769,12 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, p = GET_PKG(pkg_base, pkg_no); p2 = GET_PKG(pkg_base2, pkg_no); - retval = func(t, c, p, t2, c2, p2); - if (retval) - return retval; + retval |= func(t, c, p, t2, c2, p2); } } } } - return 0; + return retval; } /* @@ -9462,10 +9460,9 @@ int fork_it(char **argv) timersub(&tv_odd, &tv_even, &tv_delta); if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) fprintf(outf, "%s: Counter reset detected\n", progname); - else { - compute_average(EVEN_COUNTERS); - format_all_counters(EVEN_COUNTERS); - } + + compute_average(EVEN_COUNTERS); + format_all_counters(EVEN_COUNTERS); fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0); From 5ce1e9bbb2a1d43cf9e613cb03e65ecdfd309fe9 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 17 Jan 2025 13:50:29 +0100 Subject: [PATCH 24/25] tools/power turbostat: Add CPU%c1e BIC for CWF Intel Clearwater Forest report PMT telemetry with GUID 0x14421519, which can be used to obtain module c1e residency counter of type tcore clock. Add early support for the counter by using heuristic that should work for the Clearwater Forest platforms. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 68 +++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8df08819e7b41..364a44a7d7aee 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -205,6 +205,7 @@ struct msr_counter bic[] = { { 0x0, "SysWatt", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -270,6 +271,7 @@ struct msr_counter bic[] = { #define BIC_SysWatt (1ULL << 59) #define BIC_Sys_J (1ULL << 60) #define BIC_NMI (1ULL << 61) +#define BIC_CPU_c1e (1ULL << 62) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) #define BIC_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) @@ -1538,6 +1540,14 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { #define PMT_MTL_DC6_GUID 0x1a067102 #define PMT_MTL_DC6_SEQ 0 +#define PMT_COUNTER_CWF_MC1E_OFFSET_BASE 20936 +#define PMT_COUNTER_CWF_MC1E_OFFSET_INCREMENT 24 +#define PMT_COUNTER_CWF_MC1E_NUM_MODULES_PER_FILE 12 +#define PMT_COUNTER_CWF_CPUS_PER_MODULE 4 +#define PMT_COUNTER_CWF_MC1E_LSB 0 +#define PMT_COUNTER_CWF_MC1E_MSB 63 +#define PMT_CWF_MC1E_GUID 0x14421519 + unsigned long long tcore_clock_freq_hz = 800000000; #define PMT_COUNTER_NAME_SIZE_BYTES 16 @@ -9367,11 +9377,69 @@ int pmt_add_counter(unsigned int guid, unsigned int seq, const char *name, enum void pmt_init(void) { + int cpu_num; + unsigned long seq, offset, mod_num; + if (BIC_IS_ENABLED(BIC_Diec6)) { pmt_add_counter(PMT_MTL_DC6_GUID, PMT_MTL_DC6_SEQ, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY); } + + if (BIC_IS_ENABLED(BIC_CPU_c1e)) { + seq = 0; + offset = PMT_COUNTER_CWF_MC1E_OFFSET_BASE; + mod_num = 0; /* Relative module number for current PMT file. */ + + /* Open the counter for each CPU. */ + for (cpu_num = 0; cpu_num < topo.max_cpu_num;) { + + if (cpu_is_not_allowed(cpu_num)) + goto next_loop_iter; + + /* + * Set the scope to CPU, even though CWF report the counter per module. + * CPUs inside the same module will read from the same location, instead of reporting zeros. + * + * CWF with newer firmware might require a PMT_TYPE_XTAL_TIME intead of PMT_TYPE_TCORE_CLOCK. + */ + pmt_add_counter(PMT_CWF_MC1E_GUID, seq, "CPU%c1e", PMT_TYPE_TCORE_CLOCK, + PMT_COUNTER_CWF_MC1E_LSB, PMT_COUNTER_CWF_MC1E_MSB, offset, SCOPE_CPU, + FORMAT_DELTA, cpu_num, PMT_OPEN_TRY); + + /* + * Rather complex logic for each time we go to the next loop iteration, + * so keep it as a label. + */ +next_loop_iter: + /* + * Advance the cpu number and check if we should also advance offset to + * the next counter inside the PMT file. + * + * On Clearwater Forest platform, the counter is reported per module, + * so open the same counter for all of the CPUs inside the module. + * That way, reported table show the correct value for all of the CPUs inside the module, + * instead of zeros. + */ + ++cpu_num; + if (cpu_num % PMT_COUNTER_CWF_CPUS_PER_MODULE == 0) { + offset += PMT_COUNTER_CWF_MC1E_OFFSET_INCREMENT; + ++mod_num; + } + + /* + * There are PMT_COUNTER_CWF_MC1E_NUM_MODULES_PER_FILE in each PMT file. + * + * If that number is reached, seq must be incremented to advance to the next file in a sequence. + * Offset inside that file and a module counter has to be reset. + */ + if (mod_num == PMT_COUNTER_CWF_MC1E_NUM_MODULES_PER_FILE) { + ++seq; + offset = PMT_COUNTER_CWF_MC1E_OFFSET_BASE; + mod_num = 0; + } + } + } } void turbostat_init() From 2c4627c8ced77855b106c7104ecab70837d53799 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 2 Feb 2025 10:43:02 -0600 Subject: [PATCH 25/25] tools/power turbostat: version 2025.02.02 Summary of Changes since 2024.11.30: Fix regression in 2023.11.07 that affinitized forked child in one-shot mode. Harden one-shot mode against hotplug online/offline Enable RAPL SysWatt column by default. Add initial PTL, CWF platform support. Harden initial PMT code in response to early use. Enable first built-in PMT counter: CWF c1e residency Refuse to run on unsupported platforms without --force, to encourage updating to a version that supports the system, and to avoid no-so-useful measurement results. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 364a44a7d7aee..8d5011a0bf60d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9559,7 +9559,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.01.14 - Len Brown \n"); + fprintf(outf, "turbostat version 2025.02.02 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048