From 004b500a9031973ba0d05edca860193f9a5938df Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Aug 2024 16:48:18 +0100 Subject: [PATCH 01/72] arch_topology: init capacity_freq_ref to 0 It's useful to have capacity_freq_ref initialized to 0 for users of arch_scale_freq_ref() to detect when capacity_freq_ref was not yet set. The only scenario affected by this change in the init value is when a cpufreq driver is never loaded. As a result, the only setter of a cpu scale factor remains the call of topology_normalize_cpu_scale() from parse_dt_topology(). There we cannot use the value 0 of capacity_freq_ref, so we have to compensate for its uninitialized state. Signed-off-by: Ionela Voinescu Signed-off-by: Beata Michalska Reviewed-by: Vincent Guittot Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20240827154818.1195849-1-ionela.voinescu@arm.com Signed-off-by: Catalin Marinas --- drivers/base/arch_topology.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3ebe77566788b..8202447eb4300 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -28,7 +28,7 @@ static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; -DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; +DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 0; EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref); static bool supports_scale_freq_counters(const struct cpumask *cpus) @@ -293,13 +293,15 @@ void topology_normalize_cpu_scale(void) capacity_scale = 1; for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); + capacity = raw_capacity[cpu] * + (per_cpu(capacity_freq_ref, cpu) ?: 1); capacity_scale = max(capacity, capacity_scale); } pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); + capacity = raw_capacity[cpu] * + (per_cpu(capacity_freq_ref, cpu) ?: 1); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); From 38e480d4fcac2e191e2f24f381aa8957865c49b2 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 16:24:36 +0000 Subject: [PATCH 02/72] cpufreq: Allow arch_freq_get_on_cpu to return an error Allow arch_freq_get_on_cpu to return an error for cases when retrieving the current CPU frequency is not possible, whether due to a lack of the required arch support or other circumstances in which the current frequency cannot be determined at a given point in time. Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Acked-by: Viresh Kumar Acked-by: Rafael J.
Wysocki Link: https://lore.kernel.org/r/20250131162439.3843071-2-beata.michalska@arm.com Signed-off-by: Catalin Marinas --- arch/x86/kernel/cpu/aperfmperf.c | 2 +- arch/x86/kernel/cpu/proc.c | 7 +++++-- drivers/cpufreq/cpufreq.c | 8 ++++---- include/linux/cpufreq.h | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index f642de2ebdac8..6cf31a1649c4b 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -498,7 +498,7 @@ void arch_scale_freq_tick(void) */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) -unsigned int arch_freq_get_on_cpu(int cpu) +int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 41ed01f46bd92..6571d432cbe33 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = arch_freq_get_on_cpu(cpu); + int freq = arch_freq_get_on_cpu(cpu); - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); + if (freq < 0) + seq_puts(m, "cpu MHz\t\t: Unknown\n"); + else + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 30ffbddc7ecec..e69b0e39654ca 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -729,18 +729,18 @@ show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); -__weak unsigned int arch_freq_get_on_cpu(int cpu) +__weak int arch_freq_get_on_cpu(int cpu) { - return 0; + return -EOPNOTSUPP; } static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) { ssize_t ret; - unsigned int freq; + int freq; freq = arch_freq_get_on_cpu(policy->cpu); - if (freq) + if (freq > 0) ret = sysfs_emit(buf, "%u\n", freq); else if (cpufreq_driver->setpolicy && cpufreq_driver->get) ret = sysfs_emit(buf, "%u\n", cpufreq_driver->get(policy->cpu)); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e467..02fd4746231da 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1184,7 +1184,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ } #endif -extern unsigned int arch_freq_get_on_cpu(int cpu); +extern int arch_freq_get_on_cpu(int cpu); #ifndef arch_set_freq_scale static __always_inline From fbb4a4759b541d09ebb8e391d5fa7f9a5a0cad61 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 16:24:37 +0000 Subject: [PATCH 03/72] cpufreq: Introduce an optional cpuinfo_avg_freq sysfs entry Currently the CPUFreq core exposes two sysfs attributes that can be used to query the current frequency of a given CPU(s): namely cpuinfo_cur_freq and scaling_cur_freq. Both provide slightly different views on the subject and come with their own drawbacks. cpuinfo_cur_freq provides higher precision, though at the cost of being rather expensive. Moreover, the information retrieved via this attribute is somewhat short-lived, as the frequency can change at any point in time, making it difficult to reason about.
scaling_cur_freq, on the other hand, tends to be less accurate, and the actual level of precision (and the source of the information) varies between architectures, making it somewhat ambiguous. The new attribute, cpuinfo_avg_freq, is intended to provide a more stable, distinct interface, exposing an average frequency of a given CPU(s), as reported by the hardware, over a time frame spanning no more than a few milliseconds. As it requires appropriate hardware support, this interface is optional. Note that under the hood, the new attribute relies on the information provided by arch_freq_get_on_cpu, which, up to this point, has been feeding data for the scaling_cur_freq attribute and has thus been the source of the ambiguity in its interpretation. This has been amended by restoring the intended behavior for scaling_cur_freq, with a new dedicated config option to maintain the status quo for those who may need it. CC: Jonathan Corbet CC: Thomas Gleixner CC: Ingo Molnar CC: Borislav Petkov CC: Dave Hansen CC: H. Peter Anvin CC: Phil Auld CC: x86@kernel.org CC: linux-doc@vger.kernel.org Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Reviewed-by: Sumit Gupta Acked-by: Viresh Kumar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20250131162439.3843071-3-beata.michalska@arm.com Signed-off-by: Catalin Marinas --- Documentation/admin-guide/pm/cpufreq.rst | 17 +++++++++++++- drivers/cpufreq/Kconfig.x86 | 12 ++++++++++ drivers/cpufreq/cpufreq.c | 30 +++++++++++++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index a21369eba034d..3950583f2b154 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -248,6 +248,20 @@ are the following: If that frequency cannot be determined, this attribute should not be present. +``cpuinfo_avg_freq`` + An average frequency (in kHz) of all CPUs belonging to a given policy, + derived from hardware-provided feedback and reported over a time frame + spanning at most a few milliseconds. + + This is expected to be based on the frequency the hardware actually runs + at and, as such, might require specialised hardware support (such as the + AMU extension on ARM). If one cannot be determined, this attribute should + not be present. + + Note that a failed attempt to retrieve the current frequency for a given + CPU(s) will result in an appropriate error, e.g. EAGAIN for a CPU that + remains idle (raised on ARM). + ``cpuinfo_max_freq`` Maximum possible operating frequency the CPUs belonging to this policy can run at (in kHz). @@ -293,7 +307,8 @@ are the following: Some architectures (e.g. ``x86``) may attempt to provide information more precisely reflecting the current CPU frequency through this attribute, but that still may not be the exact current CPU frequency as - seen by the hardware at the moment. + seen by the hardware at the moment. This behavior, though, is only + available with the :c:macro:`CPUFREQ_ARCH_CUR_FREQ` option. ``scaling_driver`` The scaling driver currently in use. diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index 97c2d4f15d76e..2c5c228408bf2 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -340,3 +340,15 @@ config X86_SPEEDSTEP_RELAXED_CAP_CHECK option lets the probing code bypass some of those checks if the parameter "relaxed_check=1" is passed to the module.
+config CPUFREQ_ARCH_CUR_FREQ + default y + bool "Current frequency derived from HW provided feedback" + help + This determines whether the scaling_cur_freq sysfs attribute returns + the last requested frequency or a more precise value based on + hardware-provided feedback (such as architected counters). + Given that a more precise frequency can now be provided via the + cpuinfo_avg_freq attribute, enabling this option makes + scaling_cur_freq keep providing a counter-based frequency, + for compatibility reasons. + diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e69b0e39654ca..0ce79fed8e55d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -734,12 +734,20 @@ __weak int arch_freq_get_on_cpu(int cpu) return -EOPNOTSUPP; } +static inline bool cpufreq_avg_freq_supported(struct cpufreq_policy *policy) +{ + return arch_freq_get_on_cpu(policy->cpu) != -EOPNOTSUPP; +} + static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) { ssize_t ret; int freq; - freq = arch_freq_get_on_cpu(policy->cpu); + freq = IS_ENABLED(CONFIG_CPUFREQ_ARCH_CUR_FREQ) + ? arch_freq_get_on_cpu(policy->cpu) + : 0; + if (freq > 0) ret = sysfs_emit(buf, "%u\n", freq); else if (cpufreq_driver->setpolicy && cpufreq_driver->get) @@ -784,6 +792,19 @@ static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy, return sysfs_emit(buf, "\n"); } +/* + * show_cpuinfo_avg_freq - average CPU frequency as detected by hardware + */ +static ssize_t show_cpuinfo_avg_freq(struct cpufreq_policy *policy, + char *buf) +{ + int avg_freq = arch_freq_get_on_cpu(policy->cpu); + + if (avg_freq > 0) + return sysfs_emit(buf, "%u\n", avg_freq); + return avg_freq != 0 ? avg_freq : -EINVAL; +} + /* * show_scaling_governor - show the current policy for the specified CPU */ @@ -946,6 +967,7 @@ static ssize_t show_bios_limit(struct cpufreq_policy *policy, char *buf) } cpufreq_freq_attr_ro_perm(cpuinfo_cur_freq, 0400); +cpufreq_freq_attr_ro(cpuinfo_avg_freq); cpufreq_freq_attr_ro(cpuinfo_min_freq); cpufreq_freq_attr_ro(cpuinfo_max_freq); cpufreq_freq_attr_ro(cpuinfo_transition_latency); @@ -1073,6 +1095,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } + if (cpufreq_avg_freq_supported(policy)) { + ret = sysfs_create_file(&policy->kobj, &cpuinfo_avg_freq.attr); + if (ret) + return ret; + } + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); if (ret) return ret; From 6d61527d931ba07bca691f204cdf3201b280879d Mon Sep 17 00:00:00 2001 From: Yury Khrustalev Date: Mon, 13 Jan 2025 17:06:17 +0000 Subject: [PATCH 04/72] mm/pkey: Add PKEY_UNRESTRICTED macro Memory protection keys (pkeys) uapi has two macros for pkeys restrictions: - PKEY_DISABLE_ACCESS 0x1 - PKEY_DISABLE_WRITE 0x2 with an implicit literal value of 0x0 that means "unrestricted". Code that works with pkeys has to use this literal value when implying that a pkey imposes no restrictions. This may reduce readability because 0 can be written in various ways (e.g. 0x0 or 0) and also because 0 in the context of pkeys can be mistaken for "no permissions" (akin to PROT_NONE) while it actually means "no restrictions". This is important because pkeys are oftentimes used near mprotect(), which uses PROT_ macros. This patch adds a PKEY_UNRESTRICTED macro defined as 0x0.
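For a sense of how the new macro reads at a call site, here is a minimal userspace sketch (illustrative only, not part of the patch; it assumes the glibc pkey wrappers and an already-mapped addr/len region):

    #define _GNU_SOURCE
    #include <sys/mman.h>

    /* Before: the "no restrictions" intent hides behind a bare 0 */
    int pkey_old = pkey_alloc(0, 0);

    /* After: the access-rights argument documents itself */
    int pkey = pkey_alloc(0, PKEY_UNRESTRICTED);

    /* The PKEY_* value is no longer mistakable for a PROT_* one */
    pkey_mprotect(addr, len, PROT_READ | PROT_WRITE, pkey);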
Signed-off-by: Yury Khrustalev Acked-by: Dave Hansen Reviewed-by: Kevin Brodsky Link: https://lore.kernel.org/r/20250113170619.484698-2-yury.khrustalev@arm.com Signed-off-by: Catalin Marinas --- include/uapi/asm-generic/mman-common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 1ea2c4c33b86a..ef1c27fa3c570 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -85,6 +85,7 @@ /* compatibility flags */ #define MAP_FILE 0 +#define PKEY_UNRESTRICTED 0x0 #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ From 3809cefe93f6ae7aa2dd0d466d07500f8d7fe461 Mon Sep 17 00:00:00 2001 From: Yury Khrustalev Date: Mon, 13 Jan 2025 17:06:18 +0000 Subject: [PATCH 05/72] selftests/mm: Use PKEY_UNRESTRICTED macro Replace literal 0 with macro PKEY_UNRESTRICTED where pkey_*() functions are used in mm selftests for memory protection keys. Signed-off-by: Yury Khrustalev Suggested-by: Joey Gouly Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20250113170619.484698-3-yury.khrustalev@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/mseal_test.c | 6 +++--- tools/testing/selftests/mm/pkey-helpers.h | 3 ++- tools/testing/selftests/mm/pkey_sighandler_tests.c | 4 ++-- tools/testing/selftests/mm/protection_keys.c | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index ad17005521a8e..005f29c86484e 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -218,7 +218,7 @@ bool seal_support(void) bool pkey_supported(void) { #if defined(__i386__) || defined(__x86_64__) /* arch */ - int pkey = sys_pkey_alloc(0, 0); + int pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); if (pkey > 0) return true; @@ -1671,7 +1671,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) setup_single_address_rw(size, &ptr); FAIL_TEST_IF_FALSE(ptr != (void *)-1); - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_TEST_IF_FALSE(pkey > 0); ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey); @@ -1683,7 +1683,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) } /* sealing doesn't take effect if PKRU allow write. 
*/ - set_pkey(pkey, 0); + set_pkey(pkey, PKEY_UNRESTRICTED); ret = sys_madvise(ptr, size, MADV_DONTNEED); FAIL_TEST_IF_FALSE(!ret); diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index f080e97b39bea..ea404f80e6cb9 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -13,6 +13,7 @@ #include #include +#include #include #include "../kselftest.h" @@ -193,7 +194,7 @@ static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) static inline int kernel_has_pkeys(void) { /* try allocating a key and see if it succeeds */ - int ret = sys_pkey_alloc(0, 0); + int ret = sys_pkey_alloc(0, PKEY_UNRESTRICTED); if (ret <= 0) { return 0; } diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index 1ac8c88098807..b5e076a564c95 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -311,7 +311,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) __write_pkey_reg(pkey_reg); /* Protect the new stack with MPK 1 */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ @@ -484,7 +484,7 @@ static void test_pkru_sigreturn(void) __write_pkey_reg(pkey_reg); /* Protect the stack with MPK 2 */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index a4683f2476f27..434d8a8dc637c 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -463,7 +463,7 @@ static pid_t fork_lazy_child(void) static int alloc_pkey(void) { int ret; - unsigned long init_val = 0x0; + unsigned long init_val = PKEY_UNRESTRICTED; dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); From 00894c3fc91791c742c7724de6b8db1d1e383c16 Mon Sep 17 00:00:00 2001 From: Yury Khrustalev Date: Mon, 13 Jan 2025 17:06:19 +0000 Subject: [PATCH 06/72] selftests/powerpc: Use PKEY_UNRESTRICTED macro Replace literal 0 with macro PKEY_UNRESTRICTED where pkey_*() functions are used in mm selftests for memory protection keys for ppc target. 
Signed-off-by: Yury Khrustalev Suggested-by: Kevin Brodsky Reviewed-by: Kevin Brodsky Link: https://lore.kernel.org/r/20250113170619.484698-4-yury.khrustalev@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/powerpc/include/pkeys.h | 2 +- tools/testing/selftests/powerpc/mm/pkey_exec_prot.c | 2 +- tools/testing/selftests/powerpc/mm/pkey_siginfo.c | 2 +- tools/testing/selftests/powerpc/ptrace/core-pkey.c | 6 +++--- tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h index 3a0129467de64..c6d4063dd4f6f 100644 --- a/tools/testing/selftests/powerpc/include/pkeys.h +++ b/tools/testing/selftests/powerpc/include/pkeys.h @@ -93,7 +93,7 @@ int pkeys_unsupported(void) SKIP_IF(!hash_mmu); /* Check if the system call is supported */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); SKIP_IF(pkey < 0); sys_pkey_free(pkey); diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c index 0af4f02669a11..29b91b7456eb9 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c @@ -72,7 +72,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) switch (fault_type) { case PKEY_DISABLE_ACCESS: - pkey_set_rights(fault_pkey, 0); + pkey_set_rights(fault_pkey, PKEY_UNRESTRICTED); break; case PKEY_DISABLE_EXECUTE: /* diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c index 2db76e56d4cb9..e89a164c686ba 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c +++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c @@ -83,7 +83,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) mprotect(pgstart, pgsize, PROT_EXEC)) _exit(1); else - pkey_set_rights(pkey, 0); + pkey_set_rights(pkey, PKEY_UNRESTRICTED); fault_count++; } diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index f061434af452b..7ff53caeb4aa9 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -95,16 +95,16 @@ static int child(struct shared_info *info) /* Get some pkeys so that we can change their bits in the AMR. */ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE); if (pkey1 < 0) { - pkey1 = sys_pkey_alloc(0, 0); + pkey1 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey1 < 0); disable_execute = false; } - pkey2 = sys_pkey_alloc(0, 0); + pkey2 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey2 < 0); - pkey3 = sys_pkey_alloc(0, 0); + pkey3 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey3 < 0); info->amr |= 3ul << pkeyshift(pkey1) | 2ul << pkeyshift(pkey2); diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c index fc633014424f7..10f63042cf91b 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c @@ -57,16 +57,16 @@ static int child(struct shared_info *info) /* Get some pkeys so that we can change their bits in the AMR. 
*/ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE); if (pkey1 < 0) { - pkey1 = sys_pkey_alloc(0, 0); + pkey1 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey1 < 0, &info->child_sync); disable_execute = false; } - pkey2 = sys_pkey_alloc(0, 0); + pkey2 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey2 < 0, &info->child_sync); - pkey3 = sys_pkey_alloc(0, 0); + pkey3 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey3 < 0, &info->child_sync); info->amr1 |= 3ul << pkeyshift(pkey1); From 16d1e27475f673295f3eaac296dd835db3b8c4d4 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 16:24:38 +0000 Subject: [PATCH 07/72] arm64: Provide an AMU-based version of arch_freq_get_on_cpu With the Frequency Invariance Engine (FIE) being already wired up with sched tick and making use of relevant (core counter and constant counter) AMU counters, getting the average frequency for a given CPU, can be achieved by utilizing the frequency scale factor which reflects an average CPU frequency for the last tick period length. The solution is partially based on APERF/MPERF implementation of arch_freq_get_on_cpu. Suggested-by: Ionela Voinescu Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Reviewed-by: Sumit Gupta Link: https://lore.kernel.org/r/20250131162439.3843071-4-beata.michalska@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/topology.c | 111 +++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index cb180684d10d5..28fb004d3103c 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -88,18 +89,28 @@ int __init parse_acpi_topology(void) * initialized. 
*/ static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); -static DEFINE_PER_CPU(u64, arch_const_cycles_prev); -static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; +struct amu_cntr_sample { + u64 arch_const_cycles_prev; + u64 arch_core_cycles_prev; + unsigned long last_scale_update; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples); + void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, read_corecnt()); - this_cpu_write(arch_const_cycles_prev, read_constcnt()); + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); + + amu_sample->arch_core_cycles_prev = read_corecnt(); + amu_sample->arch_const_cycles_prev = read_constcnt(); } static inline bool freq_counters_valid(int cpu) { + struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) return false; @@ -108,8 +119,8 @@ static inline bool freq_counters_valid(int cpu) return false; } - if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || - !per_cpu(arch_core_cycles_prev, cpu))) { + if (unlikely(!amu_sample->arch_const_cycles_prev || + !amu_sample->arch_core_cycles_prev)) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); return false; } @@ -152,17 +163,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate) static void amu_scale_freq_tick(void) { + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + prev_const_cnt = amu_sample->arch_const_cycles_prev; + prev_core_cnt = amu_sample->arch_core_cycles_prev; update_freq_counters_refs(); - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); + const_cnt = amu_sample->arch_const_cycles_prev; + core_cnt = amu_sample->arch_core_cycles_prev; + /* + * This should not happen unless the AMUs have been reset and the + * counter values have not been restored - unlikely + */ if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) return; @@ -182,6 +198,8 @@ static void amu_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(arch_freq_scale, (unsigned long)scale); + + amu_sample->last_scale_update = jiffies; } static struct scale_freq_data amu_sfd = { @@ -189,6 +207,79 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; +static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) +{ + return cpumask_available(amu_fie_cpus) && + cpumask_test_cpu(cpu, amu_fie_cpus); +} + +#define AMU_SAMPLE_EXP_MS 20 + +int arch_freq_get_on_cpu(int cpu) +{ + struct amu_cntr_sample *amu_sample; + unsigned int start_cpu = cpu; + unsigned long last_update; + unsigned int freq = 0; + u64 scale; + + if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu)) + return -EOPNOTSUPP; + + while (1) { + + amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + + last_update = amu_sample->last_scale_update; + + /* + * For those CPUs that are in full dynticks mode, or those that have + * not seen tick for a while, try an alternative source for the counters + * (and thus freq scale), if available, for given policy: this boils + * down to identifying an active cpu within the same freq domain, if any. 
+ */ + if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || + time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + int ref_cpu = cpu; + + if (!policy) + return -EINVAL; + + if (!cpumask_intersects(policy->related_cpus, + housekeeping_cpumask(HK_TYPE_TICK))) { + cpufreq_cpu_put(policy); + return -EOPNOTSUPP; + } + + do { + ref_cpu = cpumask_next_wrap(ref_cpu, policy->cpus, + start_cpu, true); + + } while (ref_cpu < nr_cpu_ids && idle_cpu(ref_cpu)); + + cpufreq_cpu_put(policy); + + if (ref_cpu >= nr_cpu_ids) + /* No alternative to pull info from */ + return -EAGAIN; + + cpu = ref_cpu; + } else { + break; + } + } + /* + * Reversed computation to the one used to determine + * the arch_freq_scale value + * (see amu_scale_freq_tick for details) + */ + scale = arch_scale_freq_capacity(cpu); + freq = scale * arch_scale_freq_ref(cpu); + freq >>= SCHED_CAPACITY_SHIFT; + return freq; +} + static void amu_fie_setup(const struct cpumask *cpus) { int cpu; From 39b19974982e03bd7b950f33bc0855385845c9fb Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 16:24:39 +0000 Subject: [PATCH 08/72] arm64: Update AMU-based freq scale factor on entering idle Now that the frequency scale factor has been activated for retrieving current frequency on a given CPU, trigger its update upon entering idle. This will, to an extent, allow querying last known frequency in a non-invasive way. It will also improve the frequency scale factor accuracy when a CPU entering idle did not receive a tick for a while. As a consequence, for idle cores, the reported frequency will be the last one observed before entering the idle state. Suggested-by: Vanshidhar Konda Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Reviewed-by: Sumit Gupta Link: https://lore.kernel.org/r/20250131162439.3843071-5-beata.michalska@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/topology.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 28fb004d3103c..a09b0551ec59f 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -213,6 +213,19 @@ static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) cpumask_test_cpu(cpu, amu_fie_cpus); } +void arch_cpu_idle_enter(void) +{ + unsigned int cpu = smp_processor_id(); + + if (!amu_fie_cpu_supported(cpu)) + return; + + /* Kick in AMU update but only if one has not happened already */ + if (housekeeping_cpu(cpu, HK_TYPE_TICK) && + time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu))) + amu_scale_freq_tick(); +} + #define AMU_SAMPLE_EXP_MS 20 int arch_freq_get_on_cpu(int cpu) From cc15f548cc77574bcd68425ae01a796659bd3705 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Feb 2025 10:38:22 +0530 Subject: [PATCH 09/72] arm64/sysreg: Update register fields for ID_AA64MMFR0_EL1 This updates ID_AA64MMFR0_EL1 register fields as per the definitions based on DDI0601 2024-12. 
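As a hedged sketch of how such an update is typically consumed (illustrative; the ID_AA64MMFR0_EL1_FGT_* constants are the names the sysreg tooling generates from the enum lines above):

    u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
    u8 fgt = cpuid_feature_extract_unsigned_field(mmfr0,
                             ID_AA64MMFR0_EL1_FGT_SHIFT);

    /* 0b0010 (FGT2) implies the second generation of fine-grained
     * trap registers (HDFGRTR2_EL2 and friends) is implemented. */
    if (fgt >= ID_AA64MMFR0_EL1_FGT_FGT2) {
        /* ... safe to program the FEAT_FGT2 registers ... */
    }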
Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250203050828.1049370-2-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 762ee084b37c5..13479c5a9b1bf 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1664,6 +1664,7 @@ EndEnum UnsignedEnum 59:56 FGT 0b0000 NI 0b0001 IMP + 0b0010 FGT2 EndEnum Res0 55:48 UnsignedEnum 47:44 EXS @@ -1725,6 +1726,7 @@ Enum 3:0 PARANGE 0b0100 44 0b0101 48 0b0110 52 + 0b0111 56 EndEnum EndSysreg From 44844551670cff70a8aa5c1cde27ad1e0367e009 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Feb 2025 10:38:23 +0530 Subject: [PATCH 10/72] arm64/sysreg: Add register fields for HDFGRTR2_EL2 This adds register fields for HDFGRTR2_EL2 as per the definitions based on DDI0601 2024-12. Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250203050828.1049370-3-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 13479c5a9b1bf..736c72d772de8 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2643,6 +2643,35 @@ Field 0 E0HTRE EndSysreg +Sysreg HDFGRTR2_EL2 3 4 3 1 0 +Res0 63:25 +Field 24 nPMBMAR_EL1 +Field 23 nMDSTEPOP_EL1 +Field 22 nTRBMPAM_EL1 +Res0 21 +Field 20 nTRCITECR_EL1 +Field 19 nPMSDSFR_EL1 +Field 18 nSPMDEVAFF_EL1 +Field 17 nSPMID +Field 16 nSPMSCR_EL1 +Field 15 nSPMACCESSR_EL1 +Field 14 nSPMCR_EL0 +Field 13 nSPMOVS +Field 12 nSPMINTEN +Field 11 nSPMCNTEN +Field 10 nSPMSELR_EL0 +Field 9 nSPMEVTYPERn_EL0 +Field 8 nSPMEVCNTRn_EL0 +Field 7 nPMSSCR_EL1 +Field 6 nPMSSDATA +Field 5 nMDSELR_EL1 +Field 4 nPMUACR_EL1 +Field 3 nPMICFILTR_EL0 +Field 2 nPMICNTR_EL0 +Field 1 nPMIAR_EL1 +Field 0 nPMECR_EL1 +EndSysreg + Sysreg HDFGRTR_EL2 3 4 3 1 4 Field 63 PMBIDR_EL1 Field 62 nPMSNEVFR_EL1 From 2f1f62a1257b9d5eb98a8e161ea7d11f1678f7ad Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Feb 2025 10:38:24 +0530 Subject: [PATCH 11/72] arm64/sysreg: Add register fields for HDFGWTR2_EL2 This adds register fields for HDFGWTR2_EL2 as per the definitions based on DDI0601 2024-12. 
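One detail worth calling out for these fine-grained trap registers: the "n" prefix on most of the fields denotes negative trap polarity, i.e. a field value of 0 traps the access to EL2, while writing 1 to the n-bit disables that trap. A minimal sketch, assuming the SYS_/mask names the sysreg tooling would generate from the definitions above:

    /* Illustrative only: let the guest's PMIAR_EL1/PMECR_EL1 writes
     * through by setting the corresponding negative-polarity bits. */
    u64 val = HDFGWTR2_EL2_nPMIAR_EL1 | HDFGWTR2_EL2_nPMECR_EL1;

    write_sysreg_s(val, SYS_HDFGWTR2_EL2);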
Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250203050828.1049370-4-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 736c72d772de8..f1c366866c938 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2672,6 +2672,34 @@ Field 1 nPMIAR_EL1 Field 0 nPMECR_EL1 EndSysreg +Sysreg HDFGWTR2_EL2 3 4 3 1 1 +Res0 63:25 +Field 24 nPMBMAR_EL1 +Field 23 nMDSTEPOP_EL1 +Field 22 nTRBMPAM_EL1 +Field 21 nPMZR_EL0 +Field 20 nTRCITECR_EL1 +Field 19 nPMSDSFR_EL1 +Res0 18:17 +Field 16 nSPMSCR_EL1 +Field 15 nSPMACCESSR_EL1 +Field 14 nSPMCR_EL0 +Field 13 nSPMOVS +Field 12 nSPMINTEN +Field 11 nSPMCNTEN +Field 10 nSPMSELR_EL0 +Field 9 nSPMEVTYPERn_EL0 +Field 8 nSPMEVCNTRn_EL0 +Field 7 nPMSSCR_EL1 +Res0 6 +Field 5 nMDSELR_EL1 +Field 4 nPMUACR_EL1 +Field 3 nPMICFILTR_EL0 +Field 2 nPMICNTR_EL0 +Field 1 nPMIAR_EL1 +Field 0 nPMECR_EL1 +EndSysreg + Sysreg HDFGRTR_EL2 3 4 3 1 4 Field 63 PMBIDR_EL1 Field 62 nPMSNEVFR_EL1 From 9401476f17747586a8bfb29abfdf5ade7a8bceef Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Feb 2025 10:38:25 +0530 Subject: [PATCH 12/72] arm64/sysreg: Add register fields for HFGITR2_EL2 This adds register fields for HFGITR2_EL2 as per the definitions based on DDI0601 2024-12. Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250203050828.1049370-5-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index f1c366866c938..0008ff35a6558 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2872,6 +2872,12 @@ Field 1 AMEVCNTR00_EL0 Field 0 AMCNTEN0 EndSysreg +Sysreg HFGITR2_EL2 3 4 3 1 7 +Res0 63:2 +Field 1 nDCCIVAPS +Field 0 TSBCSYNC +EndSysreg + Sysreg ZCR_EL2 3 4 1 2 0 Fields ZCR_ELx EndSysreg From 59236089ad5243377b6905d78e39ba4183dc35f5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Feb 2025 10:38:26 +0530 Subject: [PATCH 13/72] arm64/sysreg: Add register fields for HFGRTR2_EL2 This adds register fields for HFGRTR2_EL2 as per the definitions based on DDI0601 2024-12. 
Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250203050828.1049370-6-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 0008ff35a6558..cae085317b8c0 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2700,6 +2700,25 @@ Field 1 nPMIAR_EL1 Field 0 nPMECR_EL1 EndSysreg +Sysreg HFGRTR2_EL2 3 4 3 1 2 +Res0 63:15 +Field 14 nACTLRALIAS_EL1 +Field 13 nACTLRMASK_EL1 +Field 12 nTCR2ALIAS_EL1 +Field 11 nTCRALIAS_EL1 +Field 10 nSCTLRALIAS2_EL1 +Field 9 nSCTLRALIAS_EL1 +Field 8 nCPACRALIAS_EL1 +Field 7 nTCR2MASK_EL1 +Field 6 nTCRMASK_EL1 +Field 5 nSCTLR2MASK_EL1 +Field 4 nSCTLRMASK_EL1 +Field 3 nCPACRMASK_EL1 +Field 2 nRCWSMASK_EL1 +Field 1 nERXGSR_EL1 +Field 0 nPFAR_EL1 +EndSysreg + Sysreg HDFGRTR_EL2 3 4 3 1 4 Field 63 PMBIDR_EL1 Field 62 nPMSNEVFR_EL1 From ea37be0773f04420515b8db49e50abedbaa97e23 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 3 Feb 2025 10:38:27 +0530 Subject: [PATCH 14/72] arm64/sysreg: Add register fields for HFGWTR2_EL2 This adds register fields for HFGWTR2_EL2 as per the definitions based on DDI0601 2024-12. Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250203050828.1049370-7-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index cae085317b8c0..891fe033e1b63 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2719,6 +2719,25 @@ Field 1 nERXGSR_EL1 Field 0 nPFAR_EL1 EndSysreg +Sysreg HFGWTR2_EL2 3 4 3 1 3 +Res0 63:15 +Field 14 nACTLRALIAS_EL1 +Field 13 nACTLRMASK_EL1 +Field 12 nTCR2ALIAS_EL1 +Field 11 nTCRALIAS_EL1 +Field 10 nSCTLRALIAS2_EL1 +Field 9 nSCTLRALIAS_EL1 +Field 8 nCPACRALIAS_EL1 +Field 7 nTCR2MASK_EL1 +Field 6 nTCRMASK_EL1 +Field 5 nSCTLR2MASK_EL1 +Field 4 nSCTLRMASK_EL1 +Field 3 nCPACRMASK_EL1 +Field 2 nRCWSMASK_EL1 +Res0 1 +Field 0 nPFAR_EL1 +EndSysreg + Sysreg HDFGRTR_EL2 3 4 3 1 4 Field 63 PMBIDR_EL1 Field 62 nPMSNEVFR_EL1 From 20711efa91e8ba44149f5e2ed1cf81e5355650e5 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Thu, 20 Feb 2025 09:10:15 +0000 Subject: [PATCH 15/72] arm64: Utilize for_each_cpu_wrap for reference lookup While searching for a reference CPU within a given policy, arch_freq_get_on_cpu relies on cpumask_next_wrap to iterate over all available CPUs and to ensure each is verified only once. Recent changes to cpumask_next_wrap will handle the latter no more, so switching to for_each_cpu_wrap, which preserves expected behavior while ensuring compatibility with the updates. Not to mention that when iterating over each CPU, using a dedicated iterator is preferable to an open-coded loop. 
Fixes: 16d1e27475f6 ("arm64: Provide an AMU-based version of arch_freq_get_on_cpu") Signed-off-by: Beata Michalska Link: https://lore.kernel.org/r/20250220091015.2319901-1-beata.michalska@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/topology.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index a09b0551ec59f..9e3583720668a 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -254,7 +254,7 @@ int arch_freq_get_on_cpu(int cpu) if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - int ref_cpu = cpu; + int ref_cpu; if (!policy) return -EINVAL; @@ -265,11 +265,15 @@ int arch_freq_get_on_cpu(int cpu) return -EOPNOTSUPP; } - do { - ref_cpu = cpumask_next_wrap(ref_cpu, policy->cpus, - start_cpu, true); - - } while (ref_cpu < nr_cpu_ids && idle_cpu(ref_cpu)); + for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) { + if (ref_cpu == start_cpu) { + /* Prevent verifying same CPU twice */ + ref_cpu = nr_cpu_ids; + break; + } + if (!idle_cpu(ref_cpu)) + break; + } cpufreq_cpu_put(policy); From 0424b1a81a42cbb3e5ac71ecaec83c8273cd96ce Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 21 Feb 2025 18:03:45 +0000 Subject: [PATCH 16/72] perf: arm_pmuv3: Add support for ARM Rainier PMU Add support for the ARM Rainier CPU core PMU. Signed-off-by: Vincenzo Frascino Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250221180349.1413089-7-vincenzo.frascino@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 0e360feb3432e..3785522a08e75 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1369,6 +1369,7 @@ PMUV3_INIT_SIMPLE(armv8_neoverse_v1) PMUV3_INIT_SIMPLE(armv8_neoverse_v2) PMUV3_INIT_SIMPLE(armv8_neoverse_v3) PMUV3_INIT_SIMPLE(armv8_neoverse_v3ae) +PMUV3_INIT_SIMPLE(armv8_rainier) PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) PMUV3_INIT_SIMPLE(armv8_nvidia_denver) @@ -1416,6 +1417,7 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,neoverse-v2-pmu", .data = armv8_neoverse_v2_pmu_init}, {.compatible = "arm,neoverse-v3-pmu", .data = armv8_neoverse_v3_pmu_init}, {.compatible = "arm,neoverse-v3ae-pmu", .data = armv8_neoverse_v3ae_pmu_init}, + {.compatible = "arm,rainier-pmu", .data = armv8_rainier_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_cavium_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_brcm_vulcan_pmu_init}, {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, From 04bd15c4cbc3f7bd2399d1baab958c5e738dbfc9 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 18 Feb 2025 14:39:56 -0600 Subject: [PATCH 17/72] perf: arm_pmuv3: Call kvm_vcpu_pmu_resync_el0() before enabling counters Counting events related to setup of the PMU is not desired, but kvm_vcpu_pmu_resync_el0() is called just after the PMU counters have been enabled. Move the call to before enabling the counters. 
Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-1-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 3785522a08e75..ab070d5257517 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -825,10 +825,10 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu) else armv8pmu_disable_user_access(); + kvm_vcpu_pmu_resync_el0(); + /* Enable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - - kvm_vcpu_pmu_resync_el0(); } static void armv8pmu_stop(struct arm_pmu *cpu_pmu) From dcca27bc1eccb9abc2552aab950b18a9742fb8e7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 18 Feb 2025 14:39:57 -0600 Subject: [PATCH 18/72] perf: arm_pmu: Don't disable counter in armpmu_add() Currently armpmu_add() tries to handle a newly-allocated counter having a stale associated event, but this should not be possible, and if this were to happen the current mitigation is insufficient and potentially expensive. It would be better to warn if we encounter the impossible case. Calls to pmu::add() and pmu::del() are serialized by the core perf code, and armpmu_del() clears the relevant slot in pmu_hw_events::events[] before clearing the bit in pmu_hw_events::used_mask such that the counter can be reallocated. Thus when armpmu_add() allocates a counter index from pmu_hw_events::used_mask, it should not be possible to observe a stale event in pmu_hw_events::events[] unless either pmu_hw_events::used_mask or pmu_hw_events::events[] has been corrupted. If this were to happen, we'd end up with two events with the same event->hw.idx, which would clash with each other during reprogramming, deletion, etc., and produce bogus results. Add a WARN_ON_ONCE() for this case so that we can detect if this ever occurs in practice. That possibility aside, there's no need to call arm_pmu::disable(event) for the new event. The PMU reset code initialises the counter in a disabled state, and armpmu_del() will disable the counter before it can be reused. Remove the redundant disable. Signed-off-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-2-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 398cce3d76fc4..2f33e69a8caf2 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -342,12 +342,10 @@ armpmu_add(struct perf_event *event, int flags) if (idx < 0) return idx; - /* - * If there is an event in the counter we are going to use then make - * sure it is disabled. - */ + /* The newly-allocated counter should be empty */ + WARN_ON_ONCE(hw_events->events[idx]); + event->hw.idx = idx; - armpmu->disable(event); hw_events->events[idx] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; From 4b0567ad0be52b9e7a36de94193b89e94487363f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 18 Feb 2025 14:39:58 -0600 Subject: [PATCH 19/72] perf: arm_pmuv3: Don't disable counter in armv8pmu_enable_event() Currently armv8pmu_enable_event() starts by disabling the event counter it has been asked to enable.
This should not be necessary as the counter (and the PMU as a whole) should not be active when armv8pmu_enable_event() is called. Remove the redundant call to armv8pmu_disable_event_counter(). At the same time, remove the comment immediately above as everything it says is obvious from the function names below. Signed-off-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-3-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index ab070d5257517..e506d59654e73 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -795,11 +795,6 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) static void armv8pmu_enable_event(struct perf_event *event) { - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - armv8pmu_disable_event_counter(event); armv8pmu_write_event_type(event); armv8pmu_enable_event_irq(event); armv8pmu_enable_event_counter(event); From 7a5387748215c19cee7bd693ebaf336cf9bbbdcb Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 18 Feb 2025 14:39:59 -0600 Subject: [PATCH 20/72] perf: arm_v7_pmu: Drop obvious comments for enabling/disabling counters and interrupts The function calls for enabling/disabling counters and interrupts are pretty obvious as to what they are doing, and the comments don't add any additional value. Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-4-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- drivers/perf/arm_v7_pmu.c | 44 --------------------------------------- 1 file changed, 44 deletions(-) diff --git a/drivers/perf/arm_v7_pmu.c b/drivers/perf/arm_v7_pmu.c index 420cadd108e79..7fa88e3b64e07 100644 --- a/drivers/perf/arm_v7_pmu.c +++ b/drivers/perf/arm_v7_pmu.c @@ -857,14 +857,6 @@ static void armv7pmu_enable_event(struct perf_event *event) return; } - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - - /* - * Disable counter - */ armv7_pmnc_disable_counter(idx); /* @@ -875,14 +867,7 @@ static void armv7pmu_enable_event(struct perf_event *event) if (cpu_pmu->set_event_filter || idx != ARMV7_IDX_CYCLE_COUNTER) armv7_pmnc_write_evtsel(idx, hwc->config_base); - /* - * Enable interrupt for this counter - */ armv7_pmnc_enable_intens(idx); - - /* - * Enable counter - */ armv7_pmnc_enable_counter(idx); } @@ -898,18 +883,7 @@ static void armv7pmu_disable_event(struct perf_event *event) return; } - /* - * Disable counter and interrupt - */ - - /* - * Disable counter - */ armv7_pmnc_disable_counter(idx); - - /* - * Disable interrupt for this counter - */ armv7_pmnc_disable_intens(idx); } @@ -1476,12 +1450,6 @@ static void krait_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in.
- */ - - /* Disable counter */ armv7_pmnc_disable_counter(idx); /* @@ -1494,10 +1462,7 @@ static void krait_pmu_enable_event(struct perf_event *event) else armv7_pmnc_write_evtsel(idx, hwc->config_base); - /* Enable interrupt for this counter */ armv7_pmnc_enable_intens(idx); - - /* Enable counter */ armv7_pmnc_enable_counter(idx); } @@ -1797,12 +1762,6 @@ static void scorpion_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - - /* Disable counter */ armv7_pmnc_disable_counter(idx); /* @@ -1815,10 +1774,7 @@ static void scorpion_pmu_enable_event(struct perf_event *event) else if (idx != ARMV7_IDX_CYCLE_COUNTER) armv7_pmnc_write_evtsel(idx, hwc->config_base); - /* Enable interrupt for this counter */ armv7_pmnc_enable_intens(idx); - - /* Enable counter */ armv7_pmnc_enable_counter(idx); } From 7bf1001e0d9124ad65642a5bbff76b46a35798f3 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 18 Feb 2025 14:40:00 -0600 Subject: [PATCH 21/72] perf: arm_v7_pmu: Don't disable counter in (armv7|krait_|scorpion_)pmu_enable_event() Currently (armv7|krait_|scorpion_)pmu_enable_event() start by disabling the event counter it has been asked to enable. This should not be necessary as the counter (and the PMU as a whole) should not be active when *_enable_event() is called. Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-5-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- drivers/perf/arm_v7_pmu.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/perf/arm_v7_pmu.c b/drivers/perf/arm_v7_pmu.c index 7fa88e3b64e07..17831e1920bd8 100644 --- a/drivers/perf/arm_v7_pmu.c +++ b/drivers/perf/arm_v7_pmu.c @@ -857,8 +857,6 @@ static void armv7pmu_enable_event(struct perf_event *event) return; } - armv7_pmnc_disable_counter(idx); - /* * Set event (if destined for PMNx counters) * We only need to set the event for the cycle counter if we @@ -1450,8 +1448,6 @@ static void krait_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - armv7_pmnc_disable_counter(idx); - /* * Set event (if destined for PMNx counters) * We set the event for the cycle counter because we @@ -1762,8 +1758,6 @@ static void scorpion_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - armv7_pmnc_disable_counter(idx); - /* * Set event (if destined for PMNx counters) * We don't set the event for the cycle counter because we From c2e793da59fc297b4844669e57208c66f45fce0e Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 18 Feb 2025 14:40:01 -0600 Subject: [PATCH 22/72] perf: apple_m1: Don't disable counter in m1_pmu_enable_event() Currently m1_pmu_enable_event() starts by disabling the event counter it has been asked to enable. This should not be necessary as the counter (and the PMU as a whole) should not be active when m1_pmu_enable_event() is called. 
Cc: Marc Zyngier Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-6-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- drivers/perf/apple_m1_cpu_pmu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index 06fd317529fcb..39349ecec3c1d 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -396,10 +396,6 @@ static void m1_pmu_enable_event(struct perf_event *event) user = event->hw.config_base & M1_PMU_CFG_COUNT_USER; kernel = event->hw.config_base & M1_PMU_CFG_COUNT_KERNEL; - m1_pmu_disable_counter_interrupt(event->hw.idx); - m1_pmu_disable_counter(event->hw.idx); - isb(); - m1_pmu_configure_counter(event->hw.idx, evt, user, kernel); m1_pmu_enable_counter(event->hw.idx); m1_pmu_enable_counter_interrupt(event->hw.idx); From dc4d58a752ea6cb0821d889e8412c22d5289f3d3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 18 Feb 2025 14:40:02 -0600 Subject: [PATCH 23/72] perf: arm_pmu: Move PMUv3-specific data A few fields in struct arm_pmu are only used with PMUv3, and soon we will need to add more for BRBE. Group the fields together so that we have a logical place to add more data in future. At the same time, remove the comment for reg_pmmir as it doesn't convey anything useful. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-7-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4b5b83677e3f2..c70d528594f26 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -84,7 +84,6 @@ struct arm_pmu { struct pmu pmu; cpumask_t supported_cpus; char *name; - int pmuver; irqreturn_t (*handle_irq)(struct arm_pmu *pmu); void (*enable)(struct perf_event *event); void (*disable)(struct perf_event *event); @@ -102,18 +101,20 @@ struct arm_pmu { int (*map_event)(struct perf_event *event); DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ -#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 - DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); -#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 - DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; struct hlist_node node; struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; - /* store the PMMIR_EL1 to expose slots */ + + /* PMUv3 only */ + int pmuver; u64 reg_pmmir; +#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 + DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); +#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 + DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); /* Only to be used by ACPI probing code */ unsigned long acpi_cpuid; From 678a5d3d6db64250b293e92504a4f5f6fe8743dd Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 6 Feb 2025 14:24:04 +0000 Subject: [PATCH 24/72] perf/arm-cmn: Minor event type housekeeping While handling RN-D nodes under the functionally-identical RN-I type works fine for perf tool users using 
the "rnid_" event aliases, and that is the documented and expected ABI, there's little reason not to be permissive and accept the actual RN-D type as an additional encoding for the same events as well. This may be convenient for other tooling generating event configs directly from its own topology data. In the RN-I event mood, it also seems as good a time as any to clean up a forgotten macro for CCLA_RNI events which ended up being unnecessary. Signed-off-by: Robin Murphy Reviewed-by: Ilkka Koskinen Link: https://lore.kernel.org/r/ef46a47fc4ab909093f14b2b4289a4835836ab6c.1738851844.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index ef959e66db7c3..d4fe30ff225b6 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -802,8 +802,6 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, CMN_EVENT_ATTR(_model, ccha_##_name, CMN_TYPE_CCHA, _event) #define CMN_EVENT_CCLA(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, ccla_##_name, CMN_TYPE_CCLA, _event) -#define CMN_EVENT_CCLA_RNI(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, ccla_rni_##_name, CMN_TYPE_CCLA_RNI, _event) #define CMN_EVENT_HNS(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, hns_##_name, CMN_TYPE_HNS, _event) @@ -1798,6 +1796,9 @@ static int arm_cmn_event_init(struct perf_event *event) } else if (type == CMN_TYPE_XP && (cmn->part == PART_CMN700 || cmn->part == PART_CMN_S3)) { hw->wide_sel = true; + } else if (type == CMN_TYPE_RND) { + /* Secretly permit this as an alias for "rnid" events */ + type = CMN_TYPE_RNI; } /* This is sufficiently annoying to recalculate, so cache it */ From 6eb1e8ef586ac4a3dcdc20248f9cb45e4ceb141f Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Thu, 20 Feb 2025 20:17:15 +0800 Subject: [PATCH 25/72] perf/dwc_pcie: fix some unreleased resources Release leaked resources, such as plat_dev and dev_info. 
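The shape of the fix is the classic init-time unwind ladder; a generic sketch of the pattern (names here are illustrative, not the driver's):

    static int __init foo_init(void)
    {
            int ret;

            ret = acquire_a();      /* e.g. register devices */
            if (ret)
                    return ret;

            ret = acquire_b();      /* e.g. cpuhp state */
            if (ret)
                    goto err_release_a;

            ret = acquire_c();      /* e.g. driver + notifier */
            if (ret)
                    goto err_release_b;

            return 0;

    err_release_b:
            release_b();
    err_release_a:
            release_a();
            return ret;
    }

Each label releases exactly what was acquired before the failing step, in reverse order; the patch below applies this to the platform driver, the CPU hotplug state and the per-device list.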
Signed-off-by: Yunhui Cui Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20250220121716.50324-2-cuiyunhui@bytedance.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index cccecae9823f6..19fa2ba8dd670 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -572,8 +572,10 @@ static int dwc_pcie_register_dev(struct pci_dev *pdev) return PTR_ERR(plat_dev); dev_info = kzalloc(sizeof(*dev_info), GFP_KERNEL); - if (!dev_info) + if (!dev_info) { + platform_device_unregister(plat_dev); return -ENOMEM; + } /* Cache platform device to handle pci device hotplug */ dev_info->plat_dev = plat_dev; @@ -730,6 +732,15 @@ static struct platform_driver dwc_pcie_pmu_driver = { .driver = {.name = "dwc_pcie_pmu",}, }; +static void dwc_pcie_cleanup_devices(void) +{ + struct dwc_pcie_dev_info *dev_info, *tmp; + + list_for_each_entry_safe(dev_info, tmp, &dwc_pcie_dev_info_head, dev_node) { + dwc_pcie_unregister_dev(dev_info); + } +} + static int __init dwc_pcie_pmu_init(void) { struct pci_dev *pdev = NULL; @@ -742,7 +753,7 @@ static int __init dwc_pcie_pmu_init(void) ret = dwc_pcie_register_dev(pdev); if (ret) { pci_dev_put(pdev); - return ret; + goto err_cleanup; } } @@ -751,35 +762,35 @@ static int __init dwc_pcie_pmu_init(void) dwc_pcie_pmu_online_cpu, dwc_pcie_pmu_offline_cpu); if (ret < 0) - return ret; + goto err_cleanup; dwc_pcie_pmu_hp_state = ret; ret = platform_driver_register(&dwc_pcie_pmu_driver); if (ret) - goto platform_driver_register_err; + goto err_remove_cpuhp; ret = bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb); if (ret) - goto platform_driver_register_err; + goto err_unregister_driver; notify = true; return 0; -platform_driver_register_err: +err_unregister_driver: + platform_driver_unregister(&dwc_pcie_pmu_driver); +err_remove_cpuhp: cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state); - +err_cleanup: + dwc_pcie_cleanup_devices(); return ret; } static void __exit dwc_pcie_pmu_exit(void) { - struct dwc_pcie_dev_info *dev_info, *tmp; - if (notify) bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb); - list_for_each_entry_safe(dev_info, tmp, &dwc_pcie_dev_info_head, dev_node) - dwc_pcie_unregister_dev(dev_info); + dwc_pcie_cleanup_devices(); platform_driver_unregister(&dwc_pcie_pmu_driver); cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state); } From 7f35b429802a8065aa61e2a3f567089649f4d98e Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Thu, 20 Feb 2025 20:17:16 +0800 Subject: [PATCH 26/72] perf/dwc_pcie: fix duplicate pci_dev devices During platform_device_register(), wrongly using a struct pci_dev as platform_data caused a kmemdup() copy of the pci_dev, including its embedded struct device. Worse still, accessing the duplicated device leads to list corruption, as its mutex content (e.g., list, magic) remains the same as the original's.
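To make the failure mode concrete, a hedged sketch of the before/after (mirroring the diff below):

    /* Broken: platform_device_register_data() kmemdup()s the buffer,
     * so this byte-copies a live struct pci_dev, embedded struct
     * device (mutexes, list heads) and all. */
    plat_dev = platform_device_register_data(NULL, "dwc_pcie_pmu", sbdf,
                                             pdev, sizeof(*pdev));

    /* Fixed: register only the sbdf id, then look the PCI device up
     * again when it is actually needed. */
    plat_dev = platform_device_register_simple("dwc_pcie_pmu", sbdf,
                                               NULL, 0);
    pdev = pci_get_domain_bus_and_slot(sbdf >> 16,
                                       PCI_BUS_NUM(sbdf & 0xffff),
                                       sbdf & 0xff);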
Signed-off-by: Yunhui Cui Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20250220121716.50324-3-cuiyunhui@bytedance.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 19fa2ba8dd670..f851e070760c5 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -565,9 +565,7 @@ static int dwc_pcie_register_dev(struct pci_dev *pdev) u32 sbdf; sbdf = (pci_domain_nr(pdev->bus) << 16) | PCI_DEVID(pdev->bus->number, pdev->devfn); - plat_dev = platform_device_register_data(NULL, "dwc_pcie_pmu", sbdf, - pdev, sizeof(*pdev)); - + plat_dev = platform_device_register_simple("dwc_pcie_pmu", sbdf, NULL, 0); if (IS_ERR(plat_dev)) return PTR_ERR(plat_dev); @@ -616,18 +614,26 @@ static struct notifier_block dwc_pcie_pmu_nb = { static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) { - struct pci_dev *pdev = plat_dev->dev.platform_data; + struct pci_dev *pdev; struct dwc_pcie_pmu *pcie_pmu; char *name; u32 sbdf; u16 vsec; int ret; + sbdf = plat_dev->id; + pdev = pci_get_domain_bus_and_slot(sbdf >> 16, PCI_BUS_NUM(sbdf & 0xffff), + sbdf & 0xff); + if (!pdev) { + pr_err("No pdev found for the sbdf 0x%x\n", sbdf); + return -ENODEV; + } + vsec = dwc_pcie_des_cap(pdev); if (!vsec) return -ENODEV; - sbdf = plat_dev->id; + pci_dev_put(pdev); name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x", sbdf); if (!name) return -ENOMEM; @@ -642,7 +648,7 @@ static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) pcie_pmu->on_cpu = -1; pcie_pmu->pmu = (struct pmu){ .name = name, - .parent = &pdev->dev, + .parent = &plat_dev->dev, .module = THIS_MODULE, .attr_groups = dwc_pcie_attr_groups, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, From 2d7872f3ae3b790ff5b1343fa84f8bf2c3e486e6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 27 Feb 2025 07:54:12 +0530 Subject: [PATCH 27/72] arm64/mm: Convert __pte_to_phys() and __phys_to_pte_val() as functions When CONFIG_ARM64_PA_BITS_52 is enabled, page table helpers __pte_to_phys() and __phys_to_pte_val() are functions which return phys_addr_t and pteval_t respectively as expected. But otherwise without this config being enabled, they are defined as macros and their return types are implicit. Until now this has worked out correctly as both pte_t and phys_addr_t data types have been 64 bits. But with the introduction of 128 bit page tables, pte_t becomes 128 bits. Hence this ends up with incorrect widths after the conversions, which leads to compiler warnings. Fix these warnings by converting __pte_to_phys() and __phys_to_pte_val() as functions instead where the return types are handled explicitly. 
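The width problem is easiest to see outside the kernel: a macro has no declared return type, so its result silently keeps the (now 128-bit) type of pte_val(), whereas an inline function forces the conversion to the documented width at one visible point. A standalone sketch with illustrative types, not the kernel's actual definitions:

  typedef unsigned __int128 pteval128_t;	/* stand-in for a 128-bit pte */
  #define ADDR_MASK 0x0000fffffffff000ULL

  /* macro: the result keeps the 128-bit type of its operand */
  #define pte_to_phys_macro(v)	((v) & ADDR_MASK)

  /* function: the result is explicitly the declared 64-bit type */
  static inline unsigned long long pte_to_phys_func(pteval128_t v)
  {
  	return v & ADDR_MASK;	/* narrowing happens here, intentionally */
  }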
Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20250227022412.2015835-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0b2a2ad1b9e83..4ebfa60ea5c62 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -68,10 +68,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) -/* - * Macros to convert between a physical address and its placement in a - * page table entry, taking care of 52-bit addresses. - */ #ifdef CONFIG_ARM64_PA_BITS_52 static inline phys_addr_t __pte_to_phys(pte_t pte) { @@ -84,8 +80,15 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK; } #else -#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_LOW) -#define __phys_to_pte_val(phys) (phys) +static inline phys_addr_t __pte_to_phys(pte_t pte) +{ + return pte_val(pte) & PTE_ADDR_LOW; +} + +static inline pteval_t __phys_to_pte_val(phys_addr_t phys) +{ + return phys; +} #endif #define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) From 34e8e63a6dc1e71350b7d93c7db3f0370f2b3d75 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 20 Feb 2025 10:35:34 +0530 Subject: [PATCH 28/72] arm64/hugetlb: Consistently use pud_sect_supported() Let's be consistent in using pud_sect_supported() for PUD_SIZE pages. Hence change hugetlb_mask_last_page() and arch_make_huge_pte() as required. Also re-arrange the switch statement so all unrecognized sizes share a common warning message.
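For context, pud_sect_supported() encodes the fact that PUD-level section mappings (1GiB with 4K pages) only exist with the 4K translation granule; its body in arch/arm64/include/asm/pgtable.h amounts to the following sketch (verify against the tree):

  static inline bool pud_sect_supported(void)
  {
  	return PAGE_SIZE == SZ_4K;
  }

which is why every PUD_SIZE case must be guarded by it rather than assuming the size is usable on all granules.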
Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20250220050534.799645-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/hugetlbpage.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 98a2a0e64e255..fd7448bb8c806 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -342,7 +342,9 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) switch (hp_size) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - return PGDIR_SIZE - PUD_SIZE; + if (pud_sect_supported()) + return PGDIR_SIZE - PUD_SIZE; + break; #endif case CONT_PMD_SIZE: return PUD_SIZE - CONT_PMD_SIZE; @@ -364,23 +366,21 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) switch (pagesize) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - entry = pud_pte(pud_mkhuge(pte_pud(entry))); + if (pud_sect_supported()) + return pud_pte(pud_mkhuge(pte_pud(entry))); break; #endif case CONT_PMD_SIZE: - entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); - fallthrough; + return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry)))); case PMD_SIZE: - entry = pmd_pte(pmd_mkhuge(pte_pmd(entry))); - break; + return pmd_pte(pmd_mkhuge(pte_pmd(entry))); case CONT_PTE_SIZE: - entry = pte_mkcont(entry); - break; + return pte_mkcont(entry); default: - pr_warn("%s: unrecognized huge page size 0x%lx\n", - __func__, pagesize); break; } + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); return entry; } From 7ae95109c64d64cdcf195788cf466ec0b3019a94 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 21 Feb 2025 09:33:30 +0000 Subject: [PATCH 29/72] kselftest/arm64: mte: Use the correct naming for tag check modes in check_hugetlb_options.c The architecture doesn't define precise/imprecise MTE tag check modes, only synchronous and asynchronous. Use the correct naming and also ensure they match the MTE_{ASYNC,SYNC}_ERR type. 
Fixes: 27879e8cb6b0 ("selftests: arm64: add hugetlb mte tests") Cc: Yang Shi Reviewed-by: Yang Shi Link: https://lore.kernel.org/r/20250221093331.2184245-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_hugetlb_options.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c index 303260a6dc65b..11f812635b510 100644 --- a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c +++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c @@ -270,13 +270,13 @@ int main(int argc, char *argv[]) "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap memory\n"); + "Check child hugetlb memory with private mapping, sync error mode and mmap memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap memory\n"); + "Check child hugetlb memory with private mapping, async error mode and mmap memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n"); + "Check child hugetlb memory with private mapping, sync error mode and mmap/mprotect memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n"); + "Check child hugetlb memory with private mapping, async error mode and mmap/mprotect memory\n"); mte_restore_setup(); free_hugetlb(); From 306219d59b72cfca4005229b2c6ad43a16479e3b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 21 Feb 2025 09:33:31 +0000 Subject: [PATCH 30/72] kselftest/arm64: mte: Skip the hugetlb tests if MTE not supported on such mappings While the kselftest was added at the same time with the kernel support for MTE on hugetlb mappings, the tests may be run on older kernels. Skip the tests if PROT_MTE is not supported on MAP_HUGETLB mappings. 
Fixes: 27879e8cb6b0 ("selftests: arm64: add hugetlb mte tests") Cc: Yang Shi Reported-by: Naresh Kamboju Reviewed-by: Dev Jain Reviewed-by: Yang Shi Link: https://lore.kernel.org/r/20250221093331.2184245-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- .../selftests/arm64/mte/check_hugetlb_options.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c index 11f812635b510..3bfcd3848432b 100644 --- a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c +++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c @@ -227,6 +227,8 @@ static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mappin int main(int argc, char *argv[]) { int err; + void *map_ptr; + unsigned long map_size; err = mte_default_setup(); if (err) @@ -243,6 +245,15 @@ int main(int argc, char *argv[]) return KSFT_FAIL; } + /* Check if MTE supports hugetlb mappings */ + map_size = default_huge_page_size(); + map_ptr = mmap(NULL, map_size, PROT_READ | PROT_MTE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + if (map_ptr == MAP_FAILED) + ksft_exit_skip("PROT_MTE not supported with MAP_HUGETLB mappings\n"); + else + munmap(map_ptr, map_size); + /* Set test plan */ ksft_set_plan(12); From 653884f88777b8858ef438cd8ee6ae4722fd5553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristina=20Mart=C5=A1enko?= Date: Fri, 28 Feb 2025 17:00:04 +0000 Subject: [PATCH 31/72] arm64: extable: Add fixup handling for uaccess CPY* instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A subsequent patch will use CPY* instructions to copy between user and kernel memory. Add a new exception fixup type to avoid fixing up faults on kernel memory accesses, in order to make it easier to debug kernel bugs and to keep the same behavior as with regular loads/stores. 
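The key decision is a two-way comparison: a CPY* instruction accesses both a user and a kernel buffer, so a single extable entry covers two different accesses. The entry records whether the user side is the write side, and ESR_ELx.WNR records whether the faulting access was a write; the fault is only fixed up when the two agree. The logic in the patch's cpy_faulted_on_uaccess() reduces to this sketch:

  /*
   *   uaccess side   faulting access   fixup?
   *   read  (W=0)    read  (WNR=0)     yes - user side faulted
   *   read  (W=0)    write (WNR=1)     no  - kernel side faulted
   *   write (W=1)    write (WNR=1)     yes
   *   write (W=1)    read  (WNR=0)     no
   */
  return uaccess_is_write == fault_on_write;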
Signed-off-by: Kristina Martšenko Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20250228170006.390100-2-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-extable.h | 10 +++++++++- arch/arm64/include/asm/extable.h | 2 +- arch/arm64/mm/extable.c | 25 ++++++++++++++++++++++++- arch/arm64/mm/fault.c | 2 +- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h index b8a5861dc7b77..292f2687a12e5 100644 --- a/arch/arm64/include/asm/asm-extable.h +++ b/arch/arm64/include/asm/asm-extable.h @@ -9,7 +9,8 @@ #define EX_TYPE_BPF 1 #define EX_TYPE_UACCESS_ERR_ZERO 2 #define EX_TYPE_KACCESS_ERR_ZERO 3 -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 4 +#define EX_TYPE_UACCESS_CPY 4 +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 5 /* Data fields for EX_TYPE_UACCESS_ERR_ZERO */ #define EX_DATA_REG_ERR_SHIFT 0 @@ -23,6 +24,9 @@ #define EX_DATA_REG_ADDR_SHIFT 5 #define EX_DATA_REG_ADDR GENMASK(9, 5) +/* Data fields for EX_TYPE_UACCESS_CPY */ +#define EX_DATA_UACCESS_WRITE BIT(0) + #ifdef __ASSEMBLY__ #define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ @@ -69,6 +73,10 @@ .endif .endm + .macro _asm_extable_uaccess_cpy, insn, fixup, uaccess_is_write + __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_UACCESS_CPY, \uaccess_is_write) + .endm + #else /* __ASSEMBLY__ */ #include diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index 72b0e71cc3de8..5892b8977710c 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -45,5 +45,5 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, } #endif /* !CONFIG_BPF_JIT */ -bool fixup_exception(struct pt_regs *regs); +bool fixup_exception(struct pt_regs *regs, unsigned long esr); #endif diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 228d681a87159..afb5241e4d91c 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -8,8 +8,18 @@ #include #include +#include #include +static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex, + unsigned long esr) +{ + bool uaccess_is_write = FIELD_GET(EX_DATA_UACCESS_WRITE, ex->data); + bool fault_on_write = esr & ESR_ELx_WNR; + + return uaccess_is_write == fault_on_write; +} + static inline unsigned long get_ex_fixup(const struct exception_table_entry *ex) { @@ -29,6 +39,17 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, return true; } +static bool ex_handler_uaccess_cpy(const struct exception_table_entry *ex, + struct pt_regs *regs, unsigned long esr) +{ + /* Do not fix up faults on kernel memory accesses */ + if (!cpy_faulted_on_uaccess(ex, esr)) + return false; + + regs->pc = get_ex_fixup(ex); + return true; +} + static bool ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -56,7 +77,7 @@ ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, return true; } -bool fixup_exception(struct pt_regs *regs) +bool fixup_exception(struct pt_regs *regs, unsigned long esr) { const struct exception_table_entry *ex; @@ -70,6 +91,8 @@ bool fixup_exception(struct pt_regs *regs) case EX_TYPE_UACCESS_ERR_ZERO: case EX_TYPE_KACCESS_ERR_ZERO: return ex_handler_uaccess_err_zero(ex, regs); + case EX_TYPE_UACCESS_CPY: + return ex_handler_uaccess_cpy(ex, regs, esr); case EX_TYPE_LOAD_UNALIGNED_ZEROPAD: return ex_handler_load_unaligned_zeropad(ex, regs); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 
ef63651099a9d..da4854fc6150a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -375,7 +375,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. */ - if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr)) return; if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs), From 04a9f771d81c109b3927d224a797dc21e2774a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristina=20Mart=C5=A1enko?= Date: Fri, 28 Feb 2025 17:00:05 +0000 Subject: [PATCH 32/72] arm64: mm: Handle PAN faults on uaccess CPY* instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A subsequent patch will use CPY* instructions to copy between user and kernel memory. Add handling for PAN faults caused by an intended kernel memory access erroneously accessing user memory, in order to make it easier to debug kernel bugs and to keep the same behavior as with regular loads/stores. Signed-off-by: Kristina Martšenko Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20250228170006.390100-3-kristina.martsenko@arm.com [catalin.marinas@arm.com: Folded the extable search into insn_may_access_user()] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/extable.h | 2 ++ arch/arm64/mm/extable.c | 15 +++++++++++++++ arch/arm64/mm/fault.c | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index 5892b8977710c..9dc39612bdf53 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -33,6 +33,8 @@ do { \ (b)->data = (tmp).data; \ } while (0) +bool insn_may_access_user(unsigned long addr, unsigned long esr); + #ifdef CONFIG_BPF_JIT bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs); diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index afb5241e4d91c..6e0528831cd3a 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -20,6 +20,21 @@ static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex, return uaccess_is_write == fault_on_write; } +bool insn_may_access_user(unsigned long addr, unsigned long esr) +{ + const struct exception_table_entry *ex = search_exception_tables(addr); + + if (!ex) + return false; + + switch (ex->type) { + case EX_TYPE_UACCESS_CPY: + return cpy_faulted_on_uaccess(ex, esr); + default: + return true; + } +} + static inline unsigned long get_ex_fixup(const struct exception_table_entry *ex) { diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index da4854fc6150a..ec0a337891ddf 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -606,7 +606,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, die_kernel_fault("execution of user memory", addr, esr, regs); - if (!search_exception_tables(regs->pc)) + if (!insn_may_access_user(regs->pc, esr)) die_kernel_fault("access to user memory outside uaccess routines", addr, esr, regs); } From fe59e0358d9b032a09d903350d5fef73601166f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristina=20Mart=C5=A1enko?= Date: Fri, 28 Feb 2025 17:00:06 +0000 Subject: [PATCH 33/72] arm64: lib: Use MOPS for usercopy routines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to what was done with the memcpy() routines, make copy_to_user(), 
copy_from_user() and clear_user() also use the Armv8.8 FEAT_MOPS instructions. Both MOPS implementation options (A and B) are supported, including asymmetric systems. The exception fixup code fixes up the registers according to the option used. In case of a fault the routines return precisely how much was not copied (as required by the comment in include/linux/uaccess.h), as unprivileged versions of CPY/SET are guaranteed not to have written past the addresses reported in the GPRs. The MOPS instructions could possibly be inlined into callers (and patched to branch to the generic implementation if not detected; similarly to what x86 does), but as a first step this patch just uses them in the out-of-line routines. Signed-off-by: Kristina Martšenko Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20250228170006.390100-4-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-uaccess.h | 4 ++++ arch/arm64/lib/clear_user.S | 25 +++++++++++++++++++++---- arch/arm64/lib/copy_from_user.S | 10 ++++++++++ arch/arm64/lib/copy_template.S | 10 ++++++++++ arch/arm64/lib/copy_to_user.S | 10 ++++++++++ 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 5b6efe8abeeb4..9148f5a319681 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -61,6 +61,10 @@ alternative_else_nop_endif 9999: x; \ _asm_extable_uaccess 9999b, l +#define USER_CPY(l, uaccess_is_write, x...) \ +9999: x; \ + _asm_extable_uaccess_cpy 9999b, l, uaccess_is_write + /* * Generate the assembly for LDTR/STTR with exception table entries. * This is complicated as there is no post-increment or pair versions of the diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index a5a5f5b97b175..de9a303b6ad0e 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,14 +17,27 @@ * Alignment fixed up by hardware. */ - .p2align 4 - // Alignment is for the loop, but since the prologue (including BTI) - // is also 16 bytes we can keep any padding outside the function SYM_FUNC_START(__arch_clear_user) add x2, x0, x1 + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + +USER(9f, setpt [x0]!, x1!, xzr) +USER(6f, setmt [x0]!, x1!, xzr) +USER(6f, setet [x0]!, x1!, xzr) + mov x0, #0 + ret +.Lno_mops: +#endif + subs x1, x1, #8 b.mi 2f -1: + +1: .p2align 4 USER(9f, sttr xzr, [x0]) add x0, x0, #8 subs x1, x1, #8 @@ -47,6 +60,10 @@ USER(7f, sttrb wzr, [x2, #-1]) ret // Exception fixups +6: b.cs 9f + // Registers are in Option A format + add x0, x0, x1 + b 9f 7: sub x0, x2, #5 // Adjust for faulting on the final byte... 8: add x0, x0, #4 // ...or the second word of the 4-7 byte case 9: sub x0, x2, x0 diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 34e3179075244..400057d607ecd 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -52,6 +52,13 @@ stp \reg1, \reg2, [\ptr], \val .endm + .macro cpy1 dst, src, count + .arch_extension mops + USER_CPY(9997f, 0, cpyfprt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 0, cpyfmrt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 0, cpyfert [\dst]!, [\src]!, \count!) 
+ .endm + end .req x5 srcin .req x15 SYM_FUNC_START(__arch_copy_from_user) @@ -62,6 +69,9 @@ SYM_FUNC_START(__arch_copy_from_user) ret // Exception fixups +9996: b.cs 9997f + // Registers are in Option A format + add dst, dst, count 9997: cmp dst, dstin b.ne 9998f // Before being absolutely sure we couldn't copy anything, try harder diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S index 488df234c49a2..7f2f5a0e2fb9f 100644 --- a/arch/arm64/lib/copy_template.S +++ b/arch/arm64/lib/copy_template.S @@ -40,6 +40,16 @@ D_l .req x13 D_h .req x14 mov dst, dstin + +#ifdef CONFIG_AS_HAS_MOPS +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + cpy1 dst, src, count + b .Lexitfunc +.Lno_mops: +#endif + cmp count, #16 /*When memory length is less than 16, the accessed are not aligned.*/ b.lo .Ltiny15 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 8022317726085..819f2e3fc7a93 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -51,6 +51,13 @@ user_stp 9997f, \reg1, \reg2, \ptr, \val .endm + .macro cpy1 dst, src, count + .arch_extension mops + USER_CPY(9997f, 1, cpyfpwt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 1, cpyfmwt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 1, cpyfewt [\dst]!, [\src]!, \count!) + .endm + end .req x5 srcin .req x15 SYM_FUNC_START(__arch_copy_to_user) @@ -61,6 +68,9 @@ SYM_FUNC_START(__arch_copy_to_user) ret // Exception fixups +9996: b.cs 9997f + // Registers are in Option A format + add dst, dst, count 9997: cmp dst, dstin b.ne 9998f // Before being absolutely sure we couldn't copy anything, try harder From f91a3a6088ea4396299f1a72f6b0302bbe21a314 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 19 Feb 2025 16:40:27 +0000 Subject: [PATCH 34/72] arm64/sysreg: Improve PIR/POR helpers We currently have one helper to set a PIRx_ELx's permission field to a given value, PIRx_ELx_PERM(), and another helper to extract a permission field from POR_ELx, POR_ELx_IDX(). The naming is pretty confusing - it isn't clear at all that "_PERM" corresponds to a setter and "_IDX" to a getter. This patch aims at improving the situation by using the same suffixes as FIELD_PREP()/FIELD_GET(), which we have already adopted for SYS_FIELD_{PREP,GET}(): * PIRx_ELx_PERM_PREP(), POR_ELx_PERM_PREP() create a register value where the permission field for a given index is set to a given value. * POR_ELx_PERM_GET() extracts the permission field from a given register value for a given index. These helpers are not implemented using FIELD_PREP()/FIELD_GET() because the mask may not be constant, and they need to be usable in assembly. They are all defined in asm/sysreg.h, as one would expect for basic sysreg-related helpers. Finally the new POR_ELx_PERM_* macros are used for existing calculations in signal.c and mmu.c. 
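A quick round-trip illustration (not part of the patch) using the definitions added below; pkey/POIndex 1 occupies bits [7:4], and POE_RW is 0x5:

  u64 por = POR_ELx_PERM_PREP(1, POE_RW);	/* (0x5 & 0xf) << 4 == 0x50 */
  u8 perm = POR_ELx_PERM_GET(1, por);		/* (0x50 >> 4) & 0xf == POE_RW */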
Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20250219164029.2309119-2-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 36 +++++++++++++-------------- arch/arm64/include/asm/por.h | 9 +++---- arch/arm64/include/asm/sysreg.h | 10 +++++++- arch/arm64/kernel/signal.c | 2 +- arch/arm64/mm/mmu.c | 6 ++--- 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index a95f1f77bb39a..7830d031742e5 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -169,25 +169,25 @@ static inline bool __pure lpa2_is_enabled(void) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL), PIE_RW)) + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */ diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h index e06e9f473675f..e6bf00bd05003 100644 --- a/arch/arm64/include/asm/por.h +++ b/arch/arm64/include/asm/por.h @@ -6,26 +6,25 @@ #ifndef _ASM_ARM64_POR_H #define _ASM_ARM64_POR_H -#define POR_BITS_PER_PKEY 4 -#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf) +#include static inline bool por_elx_allows_read(u64 por, u8 pkey) { - u8 perm = POR_ELx_IDX(por, pkey); + u8 perm = POR_ELx_PERM_GET(pkey, por); return 
perm & POE_R; } static inline bool por_elx_allows_write(u64 por, u8 pkey) { - u8 perm = POR_ELx_IDX(por, pkey); + u8 perm = POR_ELx_PERM_GET(pkey, por); return perm & POE_W; } static inline bool por_elx_allows_exec(u64 por, u8 pkey) { - u8 perm = POR_ELx_IDX(por, pkey); + u8 perm = POR_ELx_PERM_GET(pkey, por); return perm & POE_X; } diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 05ea5223d2d55..c9ce4ce5131c8 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1062,8 +1062,11 @@ #define PIE_RX UL(0xa) #define PIE_RW UL(0xc) #define PIE_RWX UL(0xe) +#define PIE_MASK UL(0xf) -#define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +#define PIRx_ELx_BITS_PER_IDX 4 +#define PIRx_ELx_PERM_SHIFT(idx) ((idx) * PIRx_ELx_BITS_PER_IDX) +#define PIRx_ELx_PERM_PREP(idx, perm) (((perm) & PIE_MASK) << PIRx_ELx_PERM_SHIFT(idx)) /* * Permission Overlay Extension (POE) permission encodings. @@ -1078,6 +1081,11 @@ #define POE_RXW UL(0x7) #define POE_MASK UL(0xf) +#define POR_ELx_BITS_PER_IDX 4 +#define POR_ELx_PERM_SHIFT(idx) ((idx) * POR_ELx_BITS_PER_IDX) +#define POR_ELx_PERM_GET(idx, reg) (((reg) >> POR_ELx_PERM_SHIFT(idx)) & POE_MASK) +#define POR_ELx_PERM_PREP(idx, perm) (((perm) & POE_MASK) << POR_ELx_PERM_SHIFT(idx)) + /* Initial value for Permission Overlay Extension for EL0 */ #define POR_EL0_INIT POE_RXW diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 99ea26d400ffe..bf62262565499 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -91,7 +91,7 @@ static void save_reset_user_access_state(struct user_access_state *ua_state) u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) - por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY); + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RXW); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b4df5bc5b1b8b..69a83a77ccce1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1557,7 +1557,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i { u64 new_por = POE_RXW; u64 old_por; - u64 pkey_shift; if (!system_supports_poe()) return -ENOSPC; @@ -1582,12 +1581,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i new_por &= ~POE_X; /* Shift the bits in to the correct place in POR for pkey: */ - pkey_shift = pkey * POR_BITS_PER_PKEY; - new_por <<= pkey_shift; + new_por = POR_ELx_PERM_PREP(pkey, new_por); /* Get old POR and mask off any old bits in place: */ old_por = read_sysreg_s(SYS_POR_EL0); - old_por &= ~(POE_MASK << pkey_shift); + old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey)); /* Write old part along with new part: */ write_sysreg_s(old_por | new_por, SYS_POR_EL0); From 83d78bbfd2a489f61e68f9e5e859ec91c247fc04 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 19 Feb 2025 16:40:28 +0000 Subject: [PATCH 35/72] arm64/sysreg: Rename POE_RXW to POE_RWX It is customary to list R, W, X permissions in that order. In fact this is already the case for PIE constants (PIE_RWX). Rename POE_RXW accordingly, as well as POE_XW (currently unused). While at it also swap the W/X lines in compute_s1_overlay_permissions() to follow the R, W, X order. 
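The rename is purely textual: the POE encodings form a bitmask with R in bit 0, X in bit 1 and W in bit 2, so the old POE_XW and the new POE_WX both name the value 0x6, and POE_RXW/POE_RWX both name 0x7. An illustrative compile-time check (not part of the patch; assumes the standard POE_R/POE_X values of 0x1/0x2):

  static_assert(POE_RWX == (POE_R | POE_W | POE_X));	/* 0x1 | 0x4 | 0x2 == 0x7 */
  static_assert(POE_WX == (POE_W | POE_X));		/* 0x4 | 0x2 == 0x6 */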
Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20250219164029.2309119-3-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 6 +++--- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kvm/at.c | 8 ++++---- arch/arm64/mm/mmu.c | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c9ce4ce5131c8..034e0576de5ab 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1077,8 +1077,8 @@ #define POE_RX UL(0x3) #define POE_W UL(0x4) #define POE_RW UL(0x5) -#define POE_XW UL(0x6) -#define POE_RXW UL(0x7) +#define POE_WX UL(0x6) +#define POE_RWX UL(0x7) #define POE_MASK UL(0xf) #define POR_ELx_BITS_PER_IDX 4 @@ -1087,7 +1087,7 @@ #define POR_ELx_PERM_PREP(idx, perm) (((perm) & POE_MASK) << POR_ELx_PERM_SHIFT(idx)) /* Initial value for Permission Overlay Extension for EL0 */ -#define POR_EL0_INIT POE_RXW +#define POR_EL0_INIT POE_RWX /* * Definitions for Guarded Control Stack diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bf62262565499..a7c37afb4ebeb 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -91,7 +91,7 @@ static void save_reset_user_access_state(struct user_access_state *ua_state) u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) - por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RXW); + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 3a96c96816e93..f74a66ce3064b 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1090,22 +1090,22 @@ static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, break; } - if (pov_perms & ~POE_RXW) + if (pov_perms & ~POE_RWX) pov_perms = POE_NONE; if (wi->poe && wr->pov) { wr->pr &= pov_perms & POE_R; - wr->px &= pov_perms & POE_X; wr->pw &= pov_perms & POE_W; + wr->px &= pov_perms & POE_X; } - if (uov_perms & ~POE_RXW) + if (uov_perms & ~POE_RWX) uov_perms = POE_NONE; if (wi->e0poe && wr->uov) { wr->ur &= uov_perms & POE_R; - wr->ux &= uov_perms & POE_X; wr->uw &= uov_perms & POE_W; + wr->ux &= uov_perms & POE_X; } } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 69a83a77ccce1..3c54dea1303ff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1555,7 +1555,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) #ifdef CONFIG_ARCH_HAS_PKEYS int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - u64 new_por = POE_RXW; + u64 new_por; u64 old_por; if (!system_supports_poe()) @@ -1570,7 +1570,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i return -EINVAL; /* Set the bits we need in POR: */ - new_por = POE_RXW; + new_por = POE_RWX; if (init_val & PKEY_DISABLE_WRITE) new_por &= ~POE_W; if (init_val & PKEY_DISABLE_ACCESS) From 650701e4ead6c97b0fbfbcb6920451e4070aadb0 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 19 Feb 2025 16:40:29 +0000 Subject: [PATCH 36/72] arm64/sysreg: Move POR_EL0_INIT to asm/por.h The value of POR_EL0_INIT is not architectural, it is a software decision. Since we have a dedicated header for POR_ELx, we might as well define POR_EL0_INIT there. While at it also define POR_EL0_INIT using POR_ELx_PERM_PREP(), making it clearer that we are setting permissions for POIndex/pkey 0. 
Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20250219164029.2309119-4-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/por.h | 2 ++ arch/arm64/include/asm/sysreg.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h index e6bf00bd05003..d913d5b529e49 100644 --- a/arch/arm64/include/asm/por.h +++ b/arch/arm64/include/asm/por.h @@ -8,6 +8,8 @@ #include +#define POR_EL0_INIT POR_ELx_PERM_PREP(0, POE_RWX) + static inline bool por_elx_allows_read(u64 por, u8 pkey) { u8 perm = POR_ELx_PERM_GET(pkey, por); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 034e0576de5ab..e3252f8bb465e 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1086,9 +1086,6 @@ #define POR_ELx_PERM_GET(idx, reg) (((reg) >> POR_ELx_PERM_SHIFT(idx)) & POE_MASK) #define POR_ELx_PERM_PREP(idx, perm) (((perm) & POE_MASK) << POR_ELx_PERM_SHIFT(idx)) -/* Initial value for Permission Overlay Extension for EL0 */ -#define POR_EL0_INIT POE_RWX - /* * Definitions for Guarded Control Stack */ From 401c3333bb2396aa52e4121887a6f6a6e2f040bc Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 19 Dec 2024 13:11:09 -0800 Subject: [PATCH 37/72] arm64: cputype: Add QCOM_CPU_PART_KRYO_3XX_GOLD Add a definition for the Qualcomm Kryo 300-series Gold cores. Reviewed-by: Dmitry Baryshkov Signed-off-by: Douglas Anderson Acked-by: Trilok Soni Link: https://lore.kernel.org/r/20241219131107.v3.1.I18e0288742871393228249a768e5d56ea65d93dc@changeid Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 6f3f4142e214f..c2da1661a44e6 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -119,6 +119,7 @@ #define QCOM_CPU_PART_KRYO 0x200 #define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800 #define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801 +#define QCOM_CPU_PART_KRYO_3XX_GOLD 0x802 #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 @@ -196,6 +197,7 @@ #define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) #define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD) #define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER) +#define MIDR_QCOM_KRYO_3XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_GOLD) #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) From 53a52a0ec7680287b170b36488203b5822e6da2d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 19 Dec 2024 13:11:10 -0800 Subject: [PATCH 38/72] arm64: cputype: Add comments about Qualcomm Kryo 5XX and 6XX cores As tested on one example of a Qualcomm Kryo 5XX CPU [1] and one example of a Qualcomm Kryo 6XX CPU [2], we don't need any extra MIDR definitions for the cores in those processors. Add comments to make it clear that these IDs weren't forgotten and just aren't needed. 
[1] https://lore.kernel.org/r/l5rqbbxn6hktlcxooolkvi5n3arkht6zzhrvdjf6kis322nsup@5hsrak4cgteq/ [2] https://lore.kernel.org/r/tx7vtur7yea6ruefrkpkccqptahgmxnsrudwdz5uzcfxnng25b@afrr5bmdk2xa/ Suggested-by: Julius Werner Signed-off-by: Douglas Anderson Reviewed-by: Dmitry Baryshkov Acked-by: Trilok Soni Link: https://lore.kernel.org/r/20241219131107.v3.2.I520dfa10ad9f598581c2591d631aa6e9e26f7603@changeid Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index c2da1661a44e6..a1a232332596a 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -202,6 +202,16 @@ #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) #define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1) + +/* + * NOTES: + * - Qualcomm Kryo 5XX Prime / Gold ID themselves as MIDR_CORTEX_A77 + * - Qualcomm Kryo 5XX Silver IDs itself as MIDR_QCOM_KRYO_4XX_SILVER + * - Qualcomm Kryo 6XX Prime IDs itself as MIDR_CORTEX_X1 + * - Qualcomm Kryo 6XX Gold IDs itself as ARM_CPU_PART_CORTEX_A78 + * - Qualcomm Kryo 6XX Silver IDs itself as MIDR_CORTEX_A55 + */ + #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) From 73276cee1a25c4a56266faf6cf0f33e88bb63859 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Tue, 11 Mar 2025 14:11:29 +0530 Subject: [PATCH 39/72] selftest/powerpc/mm/pkey: fix build-break introduced by commit 00894c3fc917 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build break was reported in the powerpc mailing list for next-20250218 with below errors make[1]: Nothing to be done for 'all'. BUILD_TARGET=/root/venkat/linux-next/tools/testing/selftests/powerpc/mm; mkdir -p $BUILD_TARGET; make OUTPUT=$BUILD_TARGET -k -C mm all CC pkey_exec_prot In file included from pkey_exec_prot.c:18: /root/venkat/linux-next/tools/testing/selftests/powerpc/include/pkeys.h: In function ‘pkeys_unsupported’: /root/venkat/linux-next/tools/testing/selftests/powerpc/include/pkeys.h:96:34: error: ‘PKEY_UNRESTRICTED’ undeclared (first use in this function) 96 | pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); | ^~~~~~~~~~~~~~~~~ https://lore.kernel.org/all/20250113170619.484698-2-yury.khrustalev@arm.com/ patchset has been queued to arm64/for-next/pkey_unrestricted which is causing a build break in the selftest/powerpc builds. Commit 6d61527d931ba ("mm/pkey: Add PKEY_UNRESTRICTED macro") added a macro PKEY_UNRESTRICTED to handle implicit literal value of 0x0 (which is "unrestricted"). Add the same to selftest/powerpc/pkeys.h to fix the reported build break. 
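With the fallback macro in place, the selftests build against both old and new uapi headers, and the call from the build log compiles as intended; PKEY_UNRESTRICTED is simply the "no restrictions" literal 0x0 for the init access rights argument. A usage sketch (sys_pkey_alloc() is the selftests' syscall wrapper referenced in the build log):

  int pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED);	/* flags = 0 */
  if (pkey < 0)
  	return -1;	/* pkeys unsupported or exhausted */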
Fixes: 00894c3fc917 ("selftests/powerpc: Use PKEY_UNRESTRICTED macro") Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/lkml/3267ea6e-5a1a-4752-96ef-8351c912d386@linux.ibm.com/T/ Tested-by: Venkat Rao Bagalkote Signed-off-by: Madhavan Srinivasan Link: https://lore.kernel.org/r/20250311084129.39308-1-maddy@linux.ibm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/powerpc/include/pkeys.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h index c6d4063dd4f6f..d6deb6ffa1b9b 100644 --- a/tools/testing/selftests/powerpc/include/pkeys.h +++ b/tools/testing/selftests/powerpc/include/pkeys.h @@ -24,6 +24,9 @@ #undef PKEY_DISABLE_EXECUTE #define PKEY_DISABLE_EXECUTE 0x4 +#undef PKEY_UNRESTRICTED +#define PKEY_UNRESTRICTED 0x0 + /* Older versions of libc do not define this */ #ifndef SEGV_PKUERR #define SEGV_PKUERR 4 From c380931712d16e23f6aa90703f438330139e9731 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 27 Feb 2025 14:41:48 +0000 Subject: [PATCH 40/72] dma: Fix encryption bit clearing for dma_to_phys phys_to_dma() sets the encryption bit on the translated DMA address. But dma_to_phys() clears the encryption bit after it has been translated back to the physical address, which could fail if the device uses DMA ranges. AMD SME doesn't use the DMA ranges and thus this is harmless. But as we are about to add support for other architectures, let us fix this. Reported-by: Aneesh Kumar K.V Link: https://lkml.kernel.org/r/yq5amsen9stc.fsf@kernel.org Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Reviewed-by: Robin Murphy Acked-by: Tom Lendacky Signed-off-by: Suzuki K Poulose Reviewed-by: Gavin Shan Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-2-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas --- include/linux/dma-direct.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d7e30d4f7503a..d20ecc24cb0f5 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -101,12 +101,13 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; + dma_addr = __sme_clr(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; - return __sme_clr(paddr); + return paddr; } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ From b66e2ee7b6c8d45bbe4b6f6885ee27511506812c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 27 Feb 2025 14:41:49 +0000 Subject: [PATCH 41/72] dma: Introduce generic dma_addr_*crypted helpers AMD SME added __sme_set/__sme_clr primitives to modify the DMA address for encrypted/decrypted traffic. However this doesn't fit in with other models, e.g., Arm CCA where the meanings are the opposite. i.e., "decrypted" traffic has a bit set and "encrypted" traffic has the top bit cleared. In preparation for adding the support for Arm CCA DMA conversions, convert the existing primitives to more generic ones that can be provided by the backends. i.e., add helpers to 1. dma_addr_encrypted - Convert a DMA address to "encrypted" [ == __sme_set() ] 2. dma_addr_unencrypted - Convert a DMA address to "decrypted" [ None exists today ] 3. 
dma_addr_canonical - Clear any "encryption"/"decryption" bits from DMA address [ SME uses __sme_clr() ] and convert to a canonical DMA address. Since the original __sme_xxx helpers come from linux/mem_encrypt.h, use that as the home for the new definitions and provide dummy ones when none is provided by the architectures. With the above, phys_to_dma_unencrypted() uses the newly added dma_addr_unencrypted() helper and to make it a bit more easier to read and avoid double conversion, provide __phys_to_dma(). Suggested-by: Robin Murphy Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Cc: Aneesh Kumar K.V Signed-off-by: Suzuki K Poulose Reviewed-by: Robin Murphy Reviewed-by: Gavin Shan Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-3-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas --- include/linux/dma-direct.h | 12 ++++++++---- include/linux/mem_encrypt.h | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d20ecc24cb0f5..f3bc0bcd70980 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -78,14 +78,18 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map) #define phys_to_dma_unencrypted phys_to_dma #endif #else -static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, - phys_addr_t paddr) +static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) +{ + return dma_addr_unencrypted(__phys_to_dma(dev, paddr)); +} /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. @@ -94,14 +98,14 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return __sme_set(phys_to_dma_unencrypted(dev, paddr)); + return dma_addr_encrypted(__phys_to_dma(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; - dma_addr = __sme_clr(dma_addr); + dma_addr = dma_addr_canonical(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index ae45263892611..07584c5e36fb4 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -26,11 +26,34 @@ */ #define __sme_set(x) ((x) | sme_me_mask) #define __sme_clr(x) ((x) & ~sme_me_mask) + +#define dma_addr_encrypted(x) __sme_set(x) +#define dma_addr_canonical(x) __sme_clr(x) + #else #define __sme_set(x) (x) #define __sme_clr(x) (x) #endif +/* + * dma_addr_encrypted() and dma_addr_unencrypted() are for converting a given DMA + * address to the respective type of addressing. + * + * dma_addr_canonical() is used to reverse any conversions for encrypted/decrypted + * back to the canonical address. 
+ */ +#ifndef dma_addr_encrypted +#define dma_addr_encrypted(x) (x) +#endif + +#ifndef dma_addr_unencrypted +#define dma_addr_unencrypted(x) (x) +#endif + +#ifndef dma_addr_canonical +#define dma_addr_canonical(x) (x) +#endif + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ From 7d953a06241624ee2efb172d037a4168978f4147 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 27 Feb 2025 14:41:50 +0000 Subject: [PATCH 42/72] arm64: realm: Use aliased addresses for device DMA to shared buffers When a device performs DMA to a shared buffer using physical addresses, (without Stage1 translation), the device must use the "{I}PA address" with the top bit set in Realm. This is to make sure that a trusted device will be able to write to shared buffers as well as the protected buffers. Thus, a Realm must always program the full address including the "protection" bit, like AMD SME encryption bits. Enable this by providing arm64 specific dma_addr_{encrypted, canonical} helpers for Realms. Please note that the VMM needs to similarly make sure that the SMMU Stage2 in the Non-secure world is setup accordingly to map IPA at the unprotected alias. Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Cc: Aneesh Kumar K.V Signed-off-by: Suzuki K Poulose Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Reviewed-by: Robin Murphy Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-4-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mem_encrypt.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index f8f78f622dd2c..a2a1eeb36d4b5 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -21,4 +21,15 @@ static inline bool force_dma_unencrypted(struct device *dev) return is_realm_world(); } +/* + * For Arm CCA guests, canonical addresses are "encrypted", so no changes + * required for dma_addr_encrypted(). + * The unencrypted DMA buffers must be accessed via the unprotected IPA, + * "top IPA bit" set. + */ +#define dma_addr_unencrypted(x) ((x) | PROT_NS_SHARED) + +/* Clear the "top" IPA bit while converting back */ +#define dma_addr_canonical(x) ((x) & ~PROT_NS_SHARED) + #endif /* __ASM_MEM_ENCRYPT_H */ From 858c7bfcb35e1100b58bb63c9f562d86e09418d9 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 27 Feb 2025 09:21:19 +0530 Subject: [PATCH 43/72] arm64/boot: Enable EL2 requirements for FEAT_PMUv3p9 FEAT_PMUv3p9 registers such as PMICNTR_EL0, PMICFILTR_EL0, and PMUACR_EL1 access from EL1 requires appropriate EL2 fine grained trap configuration via FEAT_FGT2 based trap control registers HDFGRTR2_EL2 and HDFGWTR2_EL2. Otherwise such register accesses will result in traps into EL2. Add a new helper __init_el2_fgt2() which initializes FEAT_FGT2 based fine grained trap control registers HDFGRTR2_EL2 and HDFGWTR2_EL2 (setting the bits nPMICNTR_EL0, nPMICFILTR_EL0 and nPMUACR_EL1) to enable access into PMICNTR_EL0, PMICFILTR_EL0, and PMUACR_EL1 registers. Also update booting.rst with SCR_EL3.FGTEn2 requirement for all FEAT_FGT2 based registers to be accessible in EL2. 
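Note the polarity: FEAT_FGT2 bits whose names carry a leading "n" are trap disables, so initialising them to 0b1 means "do not trap". Once the listed bits are set (and MDCR_EL3.EnPM2 on the EL3 side), an EL1 access such as the sketch below no longer bounces to EL2 (illustrative, not from the patch; SYS_PMICNTR_EL0 is the symbol per the arm64 sysreg definitions, verify against the tree):

  /* read the PMU instruction counter from EL1 without trapping */
  u64 icnt = read_sysreg_s(SYS_PMICNTR_EL0);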
Cc: Will Deacon Cc: Mark Rutland Cc: Rob Herring Cc: Jonathan Corbet Cc: Marc Zyngier Cc: Oliver Upton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: kvmarm@lists.linux.dev Fixes: 0bbff9ed8165 ("perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control") Fixes: d8226d8cfbaf ("perf: arm_pmuv3: Add support for Armv9.4 PMU instruction counter") Tested-by: Rob Herring (Arm) Reviewed-by: Rob Herring (Arm) Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250227035119.2025171-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 22 ++++++++++++++++++++++ arch/arm64/include/asm/el2_setup.h | 25 +++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index cad6fdc96b98b..dee7b6de864fc 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -288,6 +288,12 @@ Before jumping into the kernel, the following conditions must be met: - SCR_EL3.FGTEn (bit 27) must be initialised to 0b1. + For CPUs with the Fine Grained Traps 2 (FEAT_FGT2) extension present: + + - If EL3 is present and the kernel is entered at EL2: + + - SCR_EL3.FGTEn2 (bit 59) must be initialised to 0b1. + For CPUs with support for HCRX_EL2 (FEAT_HCX) present: - If EL3 is present and the kernel is entered at EL2: @@ -382,6 +388,22 @@ Before jumping into the kernel, the following conditions must be met: - SMCR_EL2.EZT0 (bit 30) must be initialised to 0b1. + For CPUs with the Performance Monitors Extension (FEAT_PMUv3p9): + + - If EL3 is present: + + - MDCR_EL3.EnPM2 (bit 7) must be initialised to 0b1. + + - If the kernel is entered at EL1 and EL2 is present: + + - HDFGRTR2_EL2.nPMICNTR_EL0 (bit 2) must be initialised to 0b1. + - HDFGRTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. + - HDFGRTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. + + - HDFGWTR2_EL2.nPMICNTR_EL0 (bit 2) must be initialised to 0b1. + - HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. + - HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. 
+ For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS): - If the kernel is entered at EL1 and EL2 is present: diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 25e1626517500..1a0071faf57ec 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -233,6 +233,30 @@ .Lskip_fgt_\@: .endm +.macro __init_el2_fgt2 + mrs x1, id_aa64mmfr0_el1 + ubfx x1, x1, #ID_AA64MMFR0_EL1_FGT_SHIFT, #4 + cmp x1, #ID_AA64MMFR0_EL1_FGT_FGT2 + b.lt .Lskip_fgt2_\@ + + mov x0, xzr + mrs x1, id_aa64dfr0_el1 + ubfx x1, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 + cmp x1, #ID_AA64DFR0_EL1_PMUVer_V3P9 + b.lt .Lskip_pmuv3p9_\@ + + orr x0, x0, #HDFGRTR2_EL2_nPMICNTR_EL0 + orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0 + orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1 +.Lskip_pmuv3p9_\@: + msr_s SYS_HDFGRTR2_EL2, x0 + msr_s SYS_HDFGWTR2_EL2, x0 + msr_s SYS_HFGRTR2_EL2, xzr + msr_s SYS_HFGWTR2_EL2, xzr + msr_s SYS_HFGITR2_EL2, xzr +.Lskip_fgt2_\@: +.endm + .macro __init_el2_gcs mrs_s x1, SYS_ID_AA64PFR1_EL1 ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 @@ -283,6 +307,7 @@ __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt + __init_el2_fgt2 __init_el2_gcs .endm From 31208bad39372cd301c6fe2a1c23829073f46eb9 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sun, 9 Mar 2025 15:07:23 +0800 Subject: [PATCH 44/72] arm64/fpsimd: Remove unused declaration fpsimd_kvm_prepare() Commit fbc7e61195e2 ("KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20250309070723.1390958-1-yuehaibing@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fpsimd.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index f2a84efc36185..564bc09b3e06d 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -80,7 +80,6 @@ extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); -extern void fpsimd_kvm_prepare(void); struct cpu_fp_state { struct user_fpsimd_state *st; From 75ecffc361bbc85696c084f3d3c73eb207386e3f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:28 -0800 Subject: [PATCH 45/72] drivers/perf: apple_m1: Refactor event select/filter configuration Supporting guest mode events will necessitate programming two event filters. Prepare by splitting up the programming of the event selector + event filter into separate headers. Opportunistically replace RMW patterns with sysreg_clear_set_s(). 
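sysreg_clear_set_s() folds the read-modify-write into a single helper and, as defined in asm/sysreg.h, elides the register write when the value is unchanged. The shape of the refactor, sketched on PMCR1 (clear/set values illustrative):

  /* before: open-coded RMW */
  val = read_sysreg_s(SYS_IMP_APL_PMCR1_EL1);
  val &= ~clear;
  val |= set;
  write_sysreg_s(val, SYS_IMP_APL_PMCR1_EL1);

  /* after: one call, and the write is skipped if nothing changes */
  sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set);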
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- drivers/perf/apple_m1_cpu_pmu.c | 52 ++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index 06fd317529fcb..cea80afd12537 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -327,11 +327,10 @@ static void m1_pmu_disable_counter_interrupt(unsigned int index) __m1_pmu_enable_counter_interrupt(index, false); } -static void m1_pmu_configure_counter(unsigned int index, u8 event, - bool user, bool kernel) +static void __m1_pmu_configure_event_filter(unsigned int index, bool user, + bool kernel) { - u64 val, user_bit, kernel_bit; - int shift; + u64 clear, set, user_bit, kernel_bit; switch (index) { case 0 ... 7: @@ -346,19 +345,24 @@ static void m1_pmu_configure_counter(unsigned int index, u8 event, BUG(); } - val = read_sysreg_s(SYS_IMP_APL_PMCR1_EL1); - + clear = set = 0; if (user) - val |= user_bit; + set |= user_bit; else - val &= ~user_bit; + clear |= user_bit; if (kernel) - val |= kernel_bit; + set |= kernel_bit; else - val &= ~kernel_bit; + clear |= kernel_bit; - write_sysreg_s(val, SYS_IMP_APL_PMCR1_EL1); + sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set); +} + +static void __m1_pmu_configure_eventsel(unsigned int index, u8 event) +{ + u64 clear = 0, set = 0; + int shift; /* * Counters 0 and 1 have fixed events. For anything else, @@ -371,21 +375,29 @@ static void m1_pmu_configure_counter(unsigned int index, u8 event, break; case 2 ... 5: shift = (index - 2) * 8; - val = read_sysreg_s(SYS_IMP_APL_PMESR0_EL1); - val &= ~((u64)0xff << shift); - val |= (u64)event << shift; - write_sysreg_s(val, SYS_IMP_APL_PMESR0_EL1); + clear |= (u64)0xff << shift; + set |= (u64)event << shift; + sysreg_clear_set_s(SYS_IMP_APL_PMESR0_EL1, clear, set); break; case 6 ... 9: shift = (index - 6) * 8; - val = read_sysreg_s(SYS_IMP_APL_PMESR1_EL1); - val &= ~((u64)0xff << shift); - val |= (u64)event << shift; - write_sysreg_s(val, SYS_IMP_APL_PMESR1_EL1); + clear |= (u64)0xff << shift; + set |= (u64)event << shift; + sysreg_clear_set_s(SYS_IMP_APL_PMESR1_EL1, clear, set); break; } } +static void m1_pmu_configure_counter(unsigned int index, unsigned long config_base) +{ + bool kernel = config_base & M1_PMU_CFG_COUNT_KERNEL; + bool user = config_base & M1_PMU_CFG_COUNT_USER; + u8 evt = config_base & M1_PMU_CFG_EVENT; + + __m1_pmu_configure_event_filter(index, user, kernel); + __m1_pmu_configure_eventsel(index, evt); +} + /* arm_pmu backend */ static void m1_pmu_enable_event(struct perf_event *event) { @@ -400,7 +412,7 @@ static void m1_pmu_enable_event(struct perf_event *event) m1_pmu_disable_counter(event->hw.idx); isb(); - m1_pmu_configure_counter(event->hw.idx, evt, user, kernel); + m1_pmu_configure_counter(event->hw.idx, event->hw.config_base); m1_pmu_enable_counter(event->hw.idx); m1_pmu_enable_counter_interrupt(event->hw.idx); isb(); From 46573d944f00f440dc794fa87ebbdb0dd9dbf691 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:29 -0800 Subject: [PATCH 46/72] drivers/perf: apple_m1: Support host/guest event filtering The PMU appears to have a separate register for filtering 'guest' exception levels (i.e. EL1 and !ELIsInHost(EL0)) which has the same layout as PMCR1_EL1. Conveniently, there exists a VHE register alias (PMCR1_EL12) that can be used to configure it. 
Support guest events by programming the EL12 register with the intended guest kernel/userspace filters. Limit support for guest events to VHE (i.e. kernel running at EL2), as it avoids involving KVM to context switch PMU registers. VHE is the only supported mode on M* parts anyway, so this isn't an actual feature limitation. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/apple_m1_pmu.h | 1 + drivers/perf/apple_m1_cpu_pmu.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/apple_m1_pmu.h b/arch/arm64/include/asm/apple_m1_pmu.h index 99483b19b99fc..02e05d05851f7 100644 --- a/arch/arm64/include/asm/apple_m1_pmu.h +++ b/arch/arm64/include/asm/apple_m1_pmu.h @@ -37,6 +37,7 @@ #define PMCR0_PMI_ENABLE_8_9 GENMASK(45, 44) #define SYS_IMP_APL_PMCR1_EL1 sys_reg(3, 1, 15, 1, 0) +#define SYS_IMP_APL_PMCR1_EL12 sys_reg(3, 1, 15, 7, 2) #define PMCR1_COUNT_A64_EL0_0_7 GENMASK(15, 8) #define PMCR1_COUNT_A64_EL1_0_7 GENMASK(23, 16) #define PMCR1_COUNT_A64_EL0_8_9 GENMASK(41, 40) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index cea80afd12537..d6d4ff6da8626 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -120,6 +120,8 @@ enum m1_pmu_events { */ M1_PMU_CFG_COUNT_USER = BIT(8), M1_PMU_CFG_COUNT_KERNEL = BIT(9), + M1_PMU_CFG_COUNT_HOST = BIT(10), + M1_PMU_CFG_COUNT_GUEST = BIT(11), }; /* @@ -328,7 +330,7 @@ static void m1_pmu_disable_counter_interrupt(unsigned int index) } static void __m1_pmu_configure_event_filter(unsigned int index, bool user, - bool kernel) + bool kernel, bool host) { u64 clear, set, user_bit, kernel_bit; @@ -356,7 +358,10 @@ static void __m1_pmu_configure_event_filter(unsigned int index, bool user, else clear |= kernel_bit; - sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set); + if (host) + sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set); + else if (is_kernel_in_hyp_mode()) + sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL12, clear, set); } static void __m1_pmu_configure_eventsel(unsigned int index, u8 event) @@ -391,10 +396,13 @@ static void __m1_pmu_configure_eventsel(unsigned int index, u8 event) static void m1_pmu_configure_counter(unsigned int index, unsigned long config_base) { bool kernel = config_base & M1_PMU_CFG_COUNT_KERNEL; + bool guest = config_base & M1_PMU_CFG_COUNT_GUEST; + bool host = config_base & M1_PMU_CFG_COUNT_HOST; bool user = config_base & M1_PMU_CFG_COUNT_USER; u8 evt = config_base & M1_PMU_CFG_EVENT; - __m1_pmu_configure_event_filter(index, user, kernel); + __m1_pmu_configure_event_filter(index, user && host, kernel && host, true); + __m1_pmu_configure_event_filter(index, user && guest, kernel && guest, false); __m1_pmu_configure_eventsel(index, evt); } @@ -570,7 +578,7 @@ static int m1_pmu_set_event_filter(struct hw_perf_event *event, { unsigned long config_base = 0; - if (!attr->exclude_guest) { + if (!attr->exclude_guest && !is_kernel_in_hyp_mode()) { pr_debug("ARM performance counters do not support mode exclusion\n"); return -EOPNOTSUPP; } @@ -578,6 +586,10 @@ static int m1_pmu_set_event_filter(struct hw_perf_event *event, config_base |= M1_PMU_CFG_COUNT_KERNEL; if (!attr->exclude_user) config_base |= M1_PMU_CFG_COUNT_USER; + if (!attr->exclude_host) + config_base |= M1_PMU_CFG_COUNT_HOST; + if (!attr->exclude_guest) + config_base |= M1_PMU_CFG_COUNT_GUEST; event->config_base = 
config_base; From 0b626b245c570c9e9ef5bf03e0541fedab334a9d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 21 Feb 2025 10:12:20 +0530 Subject: [PATCH 47/72] KVM: arm64: ptdump: Test PMD_TYPE_MASK for block mapping Test given page table entries against PMD_TYPE_SECT on PMD_TYPE_MASK mask bits for identifying block mappings in stage 2 page tables. Cc: Marc Zyngier Cc: Oliver Upton Cc: James Morse Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: kvmarm@lists.linux.dev Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Marc Zyngier Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20250221044227.1145393-2-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kvm/ptdump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index e4a342e903e25..098416d7e5c25 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -52,8 +52,8 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = { .set = "AF", .clear = " ", }, { - .mask = PTE_TABLE_BIT | PTE_VALID, - .val = PTE_VALID, + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, .set = "BLK", .clear = " ", }, From f5e93819e2cc8433061e6f5787fffe8d8cdf9c35 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 21 Feb 2025 10:12:21 +0530 Subject: [PATCH 48/72] arm64/ptdump: Test PMD_TYPE_MASK for block mapping Test given page table entries against PMD_TYPE_SECT on PMD_TYPE_MASK mask bits for identifying block mappings in stage 1 page tables. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-3-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/ptdump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 688fbe0271ca4..8cec0da4cff28 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -80,8 +80,8 @@ static const struct ptdump_prot_bits pte_bits[] = { .set = "CON", .clear = " ", }, { - .mask = PTE_TABLE_BIT | PTE_VALID, - .val = PTE_VALID, + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, .set = "BLK", .clear = " ", }, { From dba954801004ce79db69fdc4d710063a7eed006c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 21 Feb 2025 10:12:22 +0530 Subject: [PATCH 49/72] arm64/mm: Clear PXX_TYPE_MASK in mk_[pmd|pud]_sect_prot() Clear PXX_TYPE_MASK bits in mk_[pmd|pud]_sect_prot() while creating section mappings instead of just clearing the PXX_TABLE_BIT. 
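To see why the wider mask is preferable, recall the descriptor encoding; a sketch of the bit arithmetic (values as in pgtable-hwdef.h):

/* Bits [1:0] of a pmd/pud encode the descriptor type:
 *   0b11 = table (PXX_TYPE_TABLE), 0b01 = section (PXX_TYPE_SECT),
 *   bit 0 clear = invalid. PXX_TABLE_BIT is bit 1 alone, while
 *   PXX_TYPE_MASK covers both bits.
 *
 * Starting from a table prot (type bits 0b11):
 *   (0b11 & ~PMD_TABLE_BIT) | PMD_TYPE_SECT == 0b01
 *   (0b11 & ~PMD_TYPE_MASK) | PMD_TYPE_SECT == 0b01
 * The result happens to be the same here, but clearing the whole type
 * field states the intent and is robust for any initial type bits.
 */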
Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-4-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0b2a2ad1b9e83..c16e9e74e3096 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -483,12 +483,12 @@ static inline pmd_t pte_pmd(pte_t pte) static inline pgprot_t mk_pud_sect_prot(pgprot_t prot) { - return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT); + return __pgprot((pgprot_val(prot) & ~PUD_TYPE_MASK) | PUD_TYPE_SECT); } static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) { - return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); + return __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); } static inline pte_t pte_swp_mkexclusive(pte_t pte) From 1601df9e366eeeb0dd55cc7d74c0a1fc008223ef Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 21 Feb 2025 10:12:23 +0530 Subject: [PATCH 50/72] arm64/mm: Clear PXX_TYPE_MASK and set PXD_TYPE_SECT in [pmd|pud]_mkhuge() Clear PXX_TYPE_MASK in [pmd|pud]_mkhuge() while creating section mappings instead of just the PXX_TABLE_BIT and also set PXD_TYPE_SECT. Also ensure PTE_VALID does not get modified in these helpers, because present-invalid entries should preserve their state across. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-5-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c16e9e74e3096..4485ecdffd8a5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -585,7 +585,18 @@ static inline int pmd_trans_huge(pmd_t pmd) #define pmd_write(pmd) pte_write(pmd_pte(pmd)) -#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + /* + * It's possible that the pmd is present-invalid on entry + * and in that case it needs to remain present-invalid on + * exit. So ensure the VALID bit does not get modified. + */ + pmdval_t mask = PMD_TYPE_MASK & ~PTE_VALID; + pmdval_t val = PMD_TYPE_SECT & ~PTE_VALID; + + return __pmd((pmd_val(pmd) & ~mask) | val); +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd)) @@ -613,7 +624,18 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) -#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) +static inline pud_t pud_mkhuge(pud_t pud) +{ + /* + * It's possible that the pud is present-invalid on entry + * and in that case it needs to remain present-invalid on + * exit. So ensure the VALID bit does not get modified. 
+ */ + pudval_t mask = PUD_TYPE_MASK & ~PTE_VALID; + pudval_t val = PUD_TYPE_SECT & ~PTE_VALID; + + return __pud((pud_val(pud) & ~mask) | val); +} #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) From 4fa8a9c0fc996fe5cde5f201f33e2c1dba4b5498 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 21 Feb 2025 10:12:24 +0530 Subject: [PATCH 51/72] arm64/mm: Check PXD_TYPE_TABLE in [p4d|pgd]_bad() Check page table entries against PXD_TYPE_TABLE on PXD_TYPE_MASK mask bits in [p4d|pgd]_bad() while determining a table entry, instead of just checking for PXD_TABLE_BIT. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-6-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4485ecdffd8a5..0c1691979c148 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -918,7 +918,9 @@ static inline bool mm_pud_folded(const struct mm_struct *mm) pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e)) #define p4d_none(p4d) (pgtable_l4_enabled() && !p4d_val(p4d)) -#define p4d_bad(p4d) (pgtable_l4_enabled() && !(p4d_val(p4d) & P4D_TABLE_BIT)) +#define p4d_bad(p4d) (pgtable_l4_enabled() && \ + ((p4d_val(p4d) & P4D_TYPE_MASK) != \ + P4D_TYPE_TABLE)) #define p4d_present(p4d) (!p4d_none(p4d)) static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) @@ -1045,7 +1047,9 @@ static inline bool mm_p4d_folded(const struct mm_struct *mm) pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e)) #define pgd_none(pgd) (pgtable_l5_enabled() && !pgd_val(pgd)) -#define pgd_bad(pgd) (pgtable_l5_enabled() && !(pgd_val(pgd) & PGD_TABLE_BIT)) +#define pgd_bad(pgd) (pgtable_l5_enabled() && \ + ((pgd_val(pgd) & PGD_TYPE_MASK) != \ + PGD_TYPE_TABLE)) #define pgd_present(pgd) (!pgd_none(pgd)) static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) From bfb1d2b9021c21891427acc86eb848ccedeb274e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 21 Feb 2025 10:12:25 +0530 Subject: [PATCH 52/72] arm64/mm: Check PUD_TYPE_TABLE in pud_bad() pud_bad() is currently defined in terms of pud_table(), although for some configs pud_table() is hard-coded to true, i.e. when using 64K base pages or when page table levels are less than 3. pud_bad() is intended to check that the pud is configured correctly. Hence let's open-code the same check that the full version of pud_table() uses into pud_bad(). Then it always performs the check regardless of the config.
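For reference, the "full version" of pud_table() referred to above checks both descriptor type bits, roughly (a sketch of the existing definition for configs with more than 2 levels and non-64K pages):

static inline bool pud_table(pud_t pud)
{
	return (pud_val(pud) & PUD_TYPE_MASK) == PUD_TYPE_TABLE;
}

The new pud_bad() in the diff below is simply the negation of this check, now evaluated for every config.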
Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ryan Roberts Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-7-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0c1691979c148..d16f514bde0f7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -827,7 +827,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!pud_table(pud)) +#define pud_bad(pud) ((pud_val(pud) & PUD_TYPE_MASK) != \ + PUD_TYPE_TABLE) #define pud_present(pud) pte_present(pud_pte(pud)) #ifndef __PAGETABLE_PMD_FOLDED #define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) From d1770e909898c108e8c7d30ca039053e8818a9c9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 21 Feb 2025 10:12:26 +0530 Subject: [PATCH 53/72] arm64/mm: Check pmd_table() in pmd_trans_huge() Check for pmd_table() in pmd_trans_huge() rather than just checking for the PMD_TABLE_BIT. But ensure all present-invalid entries are handled correctly by always setting PTE_VALID before checking with pmd_table(). Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ryan Roberts Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-8-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d16f514bde0f7..df89e8ec84132 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -548,18 +548,6 @@ static inline int pmd_protnone(pmd_t pmd) #endif #define pmd_present(pmd) pte_present(pmd_pte(pmd)) - -/* - * THP definitions. - */ - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_huge(pmd_t pmd) -{ - return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) @@ -746,6 +734,18 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) #define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + /* + * If pmd is present-invalid, pmd_table() won't detect it + * as a table, so force the valid bit for the comparison.
+ */ + return pmd_val(pmd) && pmd_present(pmd) && + !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } static inline bool pud_table(pud_t pud) { return true; } From 50c2726654bbc5e156040618413e25a3467de6f2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 21 Feb 2025 10:12:27 +0530 Subject: [PATCH 54/72] arm64/mm: Drop PXD_TABLE_BIT Drop all PXD_TABLE_BIT macros as they are not used any more. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-9-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index a9136cc551ccb..d2a13c3297024 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -97,7 +97,6 @@ * Level -1 descriptor (PGD). */ #define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0) -#define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1) #define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0) #define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59) @@ -107,7 +106,6 @@ * Level 0 descriptor (P4D). */ #define P4D_TYPE_TABLE (_AT(p4dval_t, 3) << 0) -#define P4D_TABLE_BIT (_AT(p4dval_t, 1) << 1) #define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0) #define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0) #define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */ @@ -119,7 +117,6 @@ * Level 1 descriptor (PUD). */ #define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0) -#define PUD_TABLE_BIT (_AT(pudval_t, 1) << 1) #define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0) #define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0) #define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */ @@ -133,7 +130,6 @@ #define PMD_TYPE_MASK (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) -#define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) #define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ /* @@ -162,7 +158,6 @@ #define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_TYPE_MASK (_AT(pteval_t, 3) << 0) #define PTE_TYPE_PAGE (_AT(pteval_t, 3) << 0) -#define PTE_TABLE_BIT (_AT(pteval_t, 1) << 1) #define PTE_USER (_AT(pteval_t, 1) << 6) /* AP[1] */ #define PTE_RDONLY (_AT(pteval_t, 1) << 7) /* AP[2] */ #define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ From bf25266f83821697dc5cc3571ff5e079840db027 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 11 Mar 2025 08:30:44 +0100 Subject: [PATCH 55/72] arm64/kernel: Always use level 2 or higher for early mappings The page table population code in map_range() uses a recursive algorithm to create the early mappings of the kernel, the DTB and the ID mapped text and data pages, and this fails to take into account that the way these page tables may be constructed is not precisely the same at each level. In particular, block mappings are not permitted at each level, and the code as it exists today might inadvertently create such a forbidden block mapping if it were used to map a region of the appropriate size and alignment. 
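For instance, with 16K pages a level-1 entry spans 64GB; a 64GB-sized, 64GB-aligned region would have satisfied the old alignment test and been given a level-1 block descriptor, which the 16K granule does not define (absent FEAT_LPA2). A sketch of the corrected decision (mirroring the diff below):

/* Blocks only at level 2, pages at level 3, tables everywhere above: */
if (level < 2 || (level == 2 && ((start | next | pa) & lmask))) {
	/* create a next-level table if necessary and recurse */
} else {
	/* emit a level-2 block or level-3 page mapping */
}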
This never happens in practice, given the limited size of the assets being mapped by the early boot code. Nonetheless, it would be better if this code behaved correctly in all circumstances. So only permit block mappings at level 2, and page mappings at level 3, for any page size, and use table mappings exclusively at all other levels. This change should have no impact in practice, but it makes the code more robust. Cc: Anshuman Khandual Reported-by: Ryan Roberts Signed-off-by: Ard Biesheuvel Reviewed-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20250311073043.96795-2-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/map_range.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 2b69e3beeef80..5778697f3062b 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -45,12 +45,12 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, * clearing the mapping */ if (protval) - protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; while (start < end) { u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); - if (level < 3 && (start | next | pa) & lmask) { + if (level < 2 || (level == 2 && (start | next | pa) & lmask)) { /* * This chunk needs a finer grained mapping. Create a * table mapping if necessary and recurse. From 862f7ad4d7fdf5e8d7ff11ad8eda5af3ad44cdae Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 5 Mar 2025 16:10:06 +0000 Subject: [PATCH 56/72] perf/arm_cspmu: Move register definitions to header Implementations may occasionally want to refer to register offsets, so for the sake of consistency move all of the register definitions to join the PMIIDR fields in the private header where they can be shared. As an example nicety, we can then define Ampere's imp-def filters in terms of the architectural PMIMPDEF range rather than open-coded offsets. Signed-off-by: Robin Murphy Reviewed-by: James Clark Reviewed-by: Ilkka Koskinen Link: https://lore.kernel.org/r/5a3c796560665b51cb63fec0d473afd8f8d0a836.1741190362.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/ampere_cspmu.c | 8 ++--- drivers/perf/arm_cspmu/arm_cspmu.c | 45 -------------------------- drivers/perf/arm_cspmu/arm_cspmu.h | 46 +++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 49 deletions(-) diff --git a/drivers/perf/arm_cspmu/ampere_cspmu.c b/drivers/perf/arm_cspmu/ampere_cspmu.c index f72f5689923c7..31cc1a4ac9df1 100644 --- a/drivers/perf/arm_cspmu/ampere_cspmu.c +++ b/drivers/perf/arm_cspmu/ampere_cspmu.c @@ -10,10 +10,10 @@ #include "arm_cspmu.h" -#define PMAUXR0 0xD80 -#define PMAUXR1 0xD84 -#define PMAUXR2 0xD88 -#define PMAUXR3 0xD8C +#define PMAUXR0 PMIMPDEF +#define PMAUXR1 (PMIMPDEF + 0x4) +#define PMAUXR2 (PMIMPDEF + 0x8) +#define PMAUXR3 (PMIMPDEF + 0xC) #define to_ampere_cspmu_ctx(cspmu) ((struct ampere_cspmu_ctx *)(cspmu->impl.ctx)) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 81e8b97e93535..769466d55bea2 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -40,51 +40,6 @@ ARM_CSPMU_EXT_ATTR(_name, arm_cspmu_cpumask_show, \ (unsigned long)_config) -/* - * CoreSight PMU Arch register offsets.
- */ -#define PMEVCNTR_LO 0x0 -#define PMEVCNTR_HI 0x4 -#define PMEVTYPER 0x400 -#define PMCCFILTR 0x47C -#define PMEVFILTR 0xA00 -#define PMCNTENSET 0xC00 -#define PMCNTENCLR 0xC20 -#define PMINTENSET 0xC40 -#define PMINTENCLR 0xC60 -#define PMOVSCLR 0xC80 -#define PMOVSSET 0xCC0 -#define PMCFGR 0xE00 -#define PMCR 0xE04 -#define PMIIDR 0xE08 - -/* PMCFGR register field */ -#define PMCFGR_NCG GENMASK(31, 28) -#define PMCFGR_HDBG BIT(24) -#define PMCFGR_TRO BIT(23) -#define PMCFGR_SS BIT(22) -#define PMCFGR_FZO BIT(21) -#define PMCFGR_MSI BIT(20) -#define PMCFGR_UEN BIT(19) -#define PMCFGR_NA BIT(17) -#define PMCFGR_EX BIT(16) -#define PMCFGR_CCD BIT(15) -#define PMCFGR_CC BIT(14) -#define PMCFGR_SIZE GENMASK(13, 8) -#define PMCFGR_N GENMASK(7, 0) - -/* PMCR register field */ -#define PMCR_TRO BIT(11) -#define PMCR_HDBG BIT(10) -#define PMCR_FZO BIT(9) -#define PMCR_NA BIT(8) -#define PMCR_DP BIT(5) -#define PMCR_X BIT(4) -#define PMCR_D BIT(3) -#define PMCR_C BIT(2) -#define PMCR_P BIT(1) -#define PMCR_E BIT(0) - /* Each SET/CLR register supports up to 32 counters. */ #define ARM_CSPMU_SET_CLR_COUNTER_SHIFT 5 #define ARM_CSPMU_SET_CLR_COUNTER_NUM \ diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 2621f31111483..576249e0deeae 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -65,6 +65,52 @@ /* The cycle counter, if implemented, is located at counter[31]. */ #define ARM_CSPMU_CYCLE_CNTR_IDX 31 +/* + * CoreSight PMU Arch register offsets. + */ +#define PMEVCNTR_LO 0x0 +#define PMEVCNTR_HI 0x4 +#define PMEVTYPER 0x400 +#define PMCCFILTR 0x47C +#define PMEVFILTR 0xA00 +#define PMCNTENSET 0xC00 +#define PMCNTENCLR 0xC20 +#define PMINTENSET 0xC40 +#define PMINTENCLR 0xC60 +#define PMOVSCLR 0xC80 +#define PMOVSSET 0xCC0 +#define PMIMPDEF 0xD80 +#define PMCFGR 0xE00 +#define PMCR 0xE04 +#define PMIIDR 0xE08 + +/* PMCFGR register field */ +#define PMCFGR_NCG GENMASK(31, 28) +#define PMCFGR_HDBG BIT(24) +#define PMCFGR_TRO BIT(23) +#define PMCFGR_SS BIT(22) +#define PMCFGR_FZO BIT(21) +#define PMCFGR_MSI BIT(20) +#define PMCFGR_UEN BIT(19) +#define PMCFGR_NA BIT(17) +#define PMCFGR_EX BIT(16) +#define PMCFGR_CCD BIT(15) +#define PMCFGR_CC BIT(14) +#define PMCFGR_SIZE GENMASK(13, 8) +#define PMCFGR_N GENMASK(7, 0) + +/* PMCR register field */ +#define PMCR_TRO BIT(11) +#define PMCR_HDBG BIT(10) +#define PMCR_FZO BIT(9) +#define PMCR_NA BIT(8) +#define PMCR_DP BIT(5) +#define PMCR_X BIT(4) +#define PMCR_D BIT(3) +#define PMCR_C BIT(2) +#define PMCR_P BIT(1) +#define PMCR_E BIT(0) + /* PMIIDR register field */ #define ARM_CSPMU_PMIIDR_IMPLEMENTER GENMASK(11, 0) #define ARM_CSPMU_PMIIDR_PRODUCTID GENMASK(31, 20) From 6de0298a3925f1e6e313430c45ae314a93f89ef6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 5 Mar 2025 16:10:07 +0000 Subject: [PATCH 57/72] perf/arm_cspmu: Generalise event filtering The notion of a single u32 filter value for any event doesn't scale well when the potential architectural scope is already two 64-bit values, and implementations may add custom stuff on the side too. Rather than try to thread arbitrary filter data through the common path, let's just make the set_ev_filter op self-contained in terms of parsing and configuring any and all filtering for the given event - splitting out a distinct op for cycles events which inherently differ - and let implementations override the whole thing if they want to do something different. This already allows the Ampere code to stop looking a bit hacky. 
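In sketch form, the reworked interface (matching the header and default implementation in the diff below): each op now parses its own config bits and writes its own registers, with no u32 filter value threaded through the core.

/* Per-implementation filter programming, fully self-contained: */
void (*set_cc_filter)(struct arm_cspmu *cspmu,
		      const struct perf_event *event);
void (*set_ev_filter)(struct arm_cspmu *cspmu,
		      const struct perf_event *event);

/* e.g. the generic default for regular event counters: */
static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
				    const struct perf_event *event)
{
	u32 filter = event->attr.config1 & ARM_CSPMU_FILTER_MASK;

	writel(filter, cspmu->base0 + PMEVFILTR + 4 * event->hw.idx);
}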
Signed-off-by: Robin Murphy Reviewed-by: James Clark Reviewed-by: Ilkka Koskinen Link: https://lore.kernel.org/r/c0cd4d4c12566dbf1b062ccd60241b3e0639f4cc.1741190362.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/ampere_cspmu.c | 24 ++++++---------------- drivers/perf/arm_cspmu/arm_cspmu.c | 29 +++++++++++---------------- drivers/perf/arm_cspmu/arm_cspmu.h | 8 ++++---- drivers/perf/arm_cspmu/nvidia_cspmu.c | 21 ++++++++++++++++++- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/drivers/perf/arm_cspmu/ampere_cspmu.c b/drivers/perf/arm_cspmu/ampere_cspmu.c index 31cc1a4ac9df1..b8ca69fd9d1d2 100644 --- a/drivers/perf/arm_cspmu/ampere_cspmu.c +++ b/drivers/perf/arm_cspmu/ampere_cspmu.c @@ -132,32 +132,20 @@ ampere_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } -static u32 ampere_cspmu_event_filter(const struct perf_event *event) +static void ampere_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) { /* - * PMEVFILTR or PMCCFILTR aren't used in Ampere SoC PMU but are marked - * as RES0. Make sure, PMCCFILTR is written zero. + * PMCCFILTR is RES0, so this is just a dummy callback to override + * the default implementation and avoid writing to it. */ - return 0; } static void ampere_cspmu_set_ev_filter(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, - u32 filter) + const struct perf_event *event) { - struct perf_event *event; - unsigned int idx; u32 threshold, rank, bank; - /* - * At this point, all the events have the same filter settings. - * Therefore, take the first event and use its configuration. - */ - idx = find_first_bit(cspmu->hw_events.used_ctrs, - cspmu->cycle_counter_logical_idx); - - event = cspmu->hw_events.events[idx]; - threshold = get_threshold(event); rank = get_rank(event); bank = get_bank(event); @@ -233,7 +221,7 @@ static int ampere_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; - impl_ops->event_filter = ampere_cspmu_event_filter; + impl_ops->set_cc_filter = ampere_cspmu_set_cc_filter; impl_ops->set_ev_filter = ampere_cspmu_set_ev_filter; impl_ops->validate_event = ampere_cspmu_validate_event; impl_ops->get_name = ampere_cspmu_get_name; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 769466d55bea2..053bb7920df6e 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -66,7 +66,9 @@ static unsigned long arm_cspmu_cpuhp_state; static DEFINE_MUTEX(arm_cspmu_lock); static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, u32 filter); + const struct perf_event *event); +static void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event); static struct acpi_apmt_node *arm_cspmu_apmt_node(struct device *dev) { @@ -205,11 +207,6 @@ static bool arm_cspmu_is_cycle_counter_event(const struct perf_event *event) return (event->attr.config == ARM_CSPMU_EVT_CYCLES_DEFAULT); } -static u32 arm_cspmu_event_filter(const struct perf_event *event) -{ - return event->attr.config1 & ARM_CSPMU_FILTER_MASK; -} - static ssize_t arm_cspmu_identifier_show(struct device *dev, struct device_attribute *attr, char *page) @@ -371,7 +368,7 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) DEFAULT_IMPL_OP(get_name), DEFAULT_IMPL_OP(is_cycle_counter_event), DEFAULT_IMPL_OP(event_type), - DEFAULT_IMPL_OP(event_filter), + DEFAULT_IMPL_OP(set_cc_filter), DEFAULT_IMPL_OP(set_ev_filter), DEFAULT_IMPL_OP(event_attr_is_visible), }; @@ -767,26 +764,26 
@@ static inline void arm_cspmu_set_event(struct arm_cspmu *cspmu, } static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, - u32 filter) + const struct perf_event *event) { + u32 filter = event->attr.config1 & ARM_CSPMU_FILTER_MASK; u32 offset = PMEVFILTR + (4 * hwc->idx); writel(filter, cspmu->base0 + offset); } -static inline void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, u32 filter) +static void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) { - u32 offset = PMCCFILTR; + u32 filter = event->attr.config1 & ARM_CSPMU_FILTER_MASK; - writel(filter, cspmu->base0 + offset); + writel(filter, cspmu->base0 + PMCCFILTR); } static void arm_cspmu_start(struct perf_event *event, int pmu_flags) { struct arm_cspmu *cspmu = to_arm_cspmu(event->pmu); struct hw_perf_event *hwc = &event->hw; - u32 filter; /* We always reprogram the counter */ if (pmu_flags & PERF_EF_RELOAD) @@ -794,13 +791,11 @@ static void arm_cspmu_start(struct perf_event *event, int pmu_flags) arm_cspmu_set_event_period(event); - filter = cspmu->impl.ops.event_filter(event); - if (event->hw.extra_reg.idx == cspmu->cycle_counter_logical_idx) { - arm_cspmu_set_cc_filter(cspmu, filter); + cspmu->impl.ops.set_cc_filter(cspmu, event); } else { arm_cspmu_set_event(cspmu, hwc); - cspmu->impl.ops.set_ev_filter(cspmu, hwc, filter); + cspmu->impl.ops.set_ev_filter(cspmu, event); } hwc->state = 0; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 576249e0deeae..d59040d6a7e39 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -149,11 +149,11 @@ struct arm_cspmu_impl_ops { bool (*is_cycle_counter_event)(const struct perf_event *event); /* Decode event type/id from configs */ u32 (*event_type)(const struct perf_event *event); - /* Decode filter value from configs */ - u32 (*event_filter)(const struct perf_event *event); - /* Set event filter */ + /* Set event filters */ + void (*set_cc_filter)(struct arm_cspmu *cspmu, + const struct perf_event *event); void (*set_ev_filter)(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, u32 filter); + const struct perf_event *event); /* Implementation specific event validation */ int (*validate_event)(struct arm_cspmu *cspmu, struct perf_event *event); diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 8116c7846a466..9e817f120828f 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -183,6 +183,24 @@ static u32 nv_cspmu_event_filter(const struct perf_event *event) return filter_val; } +static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + u32 filter = nv_cspmu_event_filter(event); + u32 offset = PMEVFILTR + (4 * event->hw.idx); + + writel(filter, cspmu->base0 + offset); +} + +static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + u32 filter = nv_cspmu_event_filter(event); + + writel(filter, cspmu->base0 + PMCCFILTR); +} + + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, NAME_FMT_SOCKET @@ -322,7 +340,8 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. 
*/ - impl_ops->event_filter = nv_cspmu_event_filter; + impl_ops->set_cc_filter = nv_cspmu_set_cc_filter; + impl_ops->set_ev_filter = nv_cspmu_set_ev_filter; impl_ops->get_event_attrs = nv_cspmu_get_event_attrs; impl_ops->get_format_attrs = nv_cspmu_get_format_attrs; impl_ops->get_name = nv_cspmu_get_name; From a28f3cbfd11fcb13f826b33a4799c2ac77e70729 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 5 Mar 2025 16:10:08 +0000 Subject: [PATCH 58/72] perf/arm_cspmu: Add PMEVFILT2R support Architecturally we have two filters for each regular event counter, so add generic support for the second one too. Signed-off-by: Robin Murphy Reviewed-by: James Clark Reviewed-by: Ilkka Koskinen Link: https://lore.kernel.org/r/b11be3f23a72bc27088b115099c8fe865b70babc.1741190362.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 7 +++++-- drivers/perf/arm_cspmu/arm_cspmu.h | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 053bb7920df6e..efa9b229e7012 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -183,6 +183,7 @@ arm_cspmu_event_attr_is_visible(struct kobject *kobj, static struct attribute *arm_cspmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, + ARM_CSPMU_FORMAT_FILTER2_ATTR, NULL, }; @@ -767,9 +768,11 @@ static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { u32 filter = event->attr.config1 & ARM_CSPMU_FILTER_MASK; - u32 offset = PMEVFILTR + (4 * hwc->idx); + u32 filter2 = event->attr.config2 & ARM_CSPMU_FILTER_MASK; + u32 offset = 4 * event->hw.idx; - writel(filter, cspmu->base0 + offset); + writel(filter, cspmu->base0 + PMEVFILTR + offset); + writel(filter2, cspmu->base0 + PMEVFILT2R + offset); } static void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index d59040d6a7e39..19684b76bd969 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -47,6 +47,8 @@ /* Default filter format */ #define ARM_CSPMU_FORMAT_FILTER_ATTR \ ARM_CSPMU_FORMAT_ATTR(filter, "config1:0-31") +#define ARM_CSPMU_FORMAT_FILTER2_ATTR \ + ARM_CSPMU_FORMAT_ATTR(filter2, "config2:0-31") /* * This is the default event number for cycle count, if supported, since the @@ -72,6 +74,7 @@ #define PMEVCNTR_HI 0x4 #define PMEVTYPER 0x400 #define PMCCFILTR 0x47C +#define PMEVFILT2R 0x800 #define PMEVFILTR 0xA00 #define PMCNTENSET 0xC00 #define PMCNTENCLR 0xC20 From 51ecb29f7a6518aeb5beeaa7ac433fe62ca3dc4d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 11 Mar 2025 10:27:10 +0530 Subject: [PATCH 59/72] arm64/mm: Define PTDESC_ORDER The address-byte shift for a single 64-bit page table entry (at any page table level) has always been hard-coded as 3 (aka 2^3 = 8). Although intuitive, it is not very readable or easy to reason about. Besides, it is going to change with D128, where each 128-bit page table entry will shift address bytes by 4 (aka 2^4 = 16) instead. Let's just formalise this address-byte shift value into a new macro called PTDESC_ORDER, establishing a logical abstraction and thus improving readability as well. While here, re-organize the EARLY_LEVEL macro along with its dependents for better clarity. This does not cause any functional change. Also replace all (PAGE_SHIFT - PTDESC_ORDER) instances with PTDESC_TABLE_SHIFT.
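A worked example of the arithmetic, assuming 4K pages and 64-bit descriptors (a sketch, not kernel code):

#define PAGE_SHIFT		12
#define PTDESC_ORDER		3			/* log2(8) */
#define PTDESC_TABLE_SHIFT	(PAGE_SHIFT - PTDESC_ORDER)	/* = 9 */
#define PTRS_PER_PTE		(1 << PTDESC_TABLE_SHIFT)	/* = 512 */
/*
 * 48-bit VA: DIV_ROUND_UP(48 - 12, 9) = 4 levels. A hypothetical
 * 128-bit descriptor (PTDESC_ORDER = 4) would give 8 bits and 256
 * entries per 4K table instead.
 */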
Cc: Will Deacon Cc: Mark Rutland Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kasan-dev@googlegroups.com Acked-by: Ard Biesheuvel Reviewed-by: Ryan Roberts Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250311045710.550625-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/kernel-pgtable.h | 8 +++---- arch/arm64/include/asm/pgtable-hwdef.h | 30 +++++++++++++++---------- arch/arm64/kernel/pi/map_range.c | 2 +- arch/arm64/mm/kasan_init.c | 6 ++--- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 940343beb3d4c..c8f48945cc09b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -323,7 +323,7 @@ config ARCH_MMAP_RND_BITS_MIN default 18 # max bits determined by the following formula: -# VA_BITS - PAGE_SHIFT - 3 +# VA_BITS - PTDESC_TABLE_SHIFT config ARCH_MMAP_RND_BITS_MAX default 19 if ARM64_VA_BITS=36 default 24 if ARM64_VA_BITS=39 diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index fd5a08450b12a..9e93733523f68 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -45,11 +45,11 @@ #define SPAN_NR_ENTRIES(vstart, vend, shift) \ ((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1) -#define EARLY_ENTRIES(vstart, vend, shift, add) \ - (SPAN_NR_ENTRIES(vstart, vend, shift) + (add)) +#define EARLY_ENTRIES(lvl, vstart, vend) \ + SPAN_NR_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * PTDESC_TABLE_SHIFT) -#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \ - (lvls > lvl ? EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0) +#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \ + ((lvls) > (lvl) ? EARLY_ENTRIES(lvl, vstart, vend) + (add) : 0) #define EARLY_PAGES(lvls, vstart, vend, add) (1 /* PGDIR page */ \ + EARLY_LEVEL(3, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index a9136cc551ccb..2889643414042 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -7,40 +7,46 @@ #include +#define PTDESC_ORDER 3 + +/* Number of VA bits resolved by a single translation table level */ +#define PTDESC_TABLE_SHIFT (PAGE_SHIFT - PTDESC_ORDER) + /* * Number of page-table levels required to address 'va_bits' wide * address, without section mapping. We resolve the top (va_bits - PAGE_SHIFT) - * bits with (PAGE_SHIFT - 3) bits at each page table level. Hence: + * bits with PTDESC_TABLE_SHIFT bits at each page table level. Hence: * - * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), (PAGE_SHIFT - 3)) + * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), PTDESC_TABLE_SHIFT) * * where DIV_ROUND_UP(n, d) => (((n) + (d) - 1) / (d)) * * We cannot include linux/kernel.h which defines DIV_ROUND_UP here * due to build issues. 
So we open code DIV_ROUND_UP here: * - * ((((va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 3)) + * ((((va_bits) - PAGE_SHIFT) + PTDESC_TABLE_SHIFT - 1) / PTDESC_TABLE_SHIFT) * * which gets simplified as : */ -#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3)) +#define ARM64_HW_PGTABLE_LEVELS(va_bits) \ + (((va_bits) - PTDESC_ORDER - 1) / PTDESC_TABLE_SHIFT) /* * Size mapped by an entry at level n ( -1 <= n <= 3) - * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits + * We map PTDESC_TABLE_SHIFT at all translation levels and PAGE_SHIFT bits * in the final page. The maximum number of translation levels supported by * the architecture is 5. Hence, starting at level n, we have further * ((4 - n) - 1) levels of translation excluding the offset within the page. * So, the total number of bits mapped by an entry at level n is : * - * ((4 - n) - 1) * (PAGE_SHIFT - 3) + PAGE_SHIFT + * ((4 - n) - 1) * PTDESC_TABLE_SHIFT + PAGE_SHIFT * * Rearranging it a bit we get : - * (4 - n) * (PAGE_SHIFT - 3) + 3 + * (4 - n) * PTDESC_TABLE_SHIFT + PTDESC_ORDER */ -#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) ((PAGE_SHIFT - 3) * (4 - (n)) + 3) +#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) (PTDESC_TABLE_SHIFT * (4 - (n)) + PTDESC_ORDER) -#define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PTE (1 << PTDESC_TABLE_SHIFT) /* * PMD_SHIFT determines the size a level 2 page table entry can map. @@ -49,7 +55,7 @@ #define PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PTRS_PER_PMD (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PMD (1 << PTDESC_TABLE_SHIFT) #endif /* @@ -59,14 +65,14 @@ #define PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) #define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PTRS_PER_PUD (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PUD (1 << PTDESC_TABLE_SHIFT) #endif #if CONFIG_PGTABLE_LEVELS > 4 #define P4D_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(0) #define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) #define P4D_MASK (~(P4D_SIZE-1)) -#define PTRS_PER_P4D (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_P4D (1 << PTDESC_TABLE_SHIFT) #endif /* diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 5778697f3062b..81345f68f9fc0 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -31,7 +31,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; - int lshift = (3 - level) * (PAGE_SHIFT - 3); + int lshift = (3 - level) * PTDESC_TABLE_SHIFT; u64 lmask = (PAGE_SIZE << lshift) - 1; start &= PAGE_MASK; diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index b65a29440a0c9..d541ce45daeb9 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -190,7 +190,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, */ static bool __init root_level_aligned(u64 addr) { - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * PTDESC_TABLE_SHIFT; return (addr % (PAGE_SIZE << shift)) == 0; } @@ -245,7 +245,7 @@ static int __init root_level_idx(u64 addr) */ u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? 
VA_BITS : vabits_actual; - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * PTDESC_TABLE_SHIFT; return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT); } @@ -269,7 +269,7 @@ static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud) */ static int __init next_level_idx(u64 addr) { - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * PTDESC_TABLE_SHIFT; return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE; } From 4b455f59945aab5610828a1320b045c82cbe8852 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 11 Mar 2025 15:51:40 +0800 Subject: [PATCH 60/72] cpu/SMT: Provide a default topology_is_primary_thread() Currently if architectures want to support HOTPLUG_SMT they need to provide a topology_is_primary_thread() telling the framework which thread in the SMT cannot offline. However arm64 doesn't have a restriction on which thread in the SMT cannot offline, a simplest choice is that just make 1st thread as the "primary" thread. So just make this as the default implementation in the framework and let architectures like x86 that have special primary thread to override this function (which they've already done). There's no need to provide a stub function if !CONFIG_SMP or !CONFIG_HOTPLUG_SMT. In such case the testing CPU is already the 1st CPU in the SMT so it's always the primary thread. Reviewed-by: Jonathan Cameron Reviewed-by: Pierre Gondois Reviewed-by: Dietmar Eggemann Signed-off-by: Yicong Yang Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250311075143.61078-2-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/powerpc/include/asm/topology.h | 1 + arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 16bacfe8c7a2c..da15b5efe8071 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -152,6 +152,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpu == cpu_first_thread_sibling(cpu); } +#define topology_is_primary_thread topology_is_primary_thread static inline bool topology_smt_thread_allowed(unsigned int cpu) { diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index ec134b7191446..6c79ee7c0957a 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -229,11 +229,11 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } +#define topology_is_primary_thread topology_is_primary_thread #else /* CONFIG_SMP */ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } -static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 52f5850730b3e..f8bb02c5e8a47 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -240,6 +240,29 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif +#ifndef topology_is_primary_thread + +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + /* + * When disabling SMT, the 
primary thread of the SMT will remain + enabled/active. Architectures that have a special primary thread + * (e.g. x86) need to override this function. Otherwise the first + * thread in the SMT can be made the primary thread. + * + * The sibling cpumask of an offline CPU always contains the CPU + * itself on architectures using the implementation of + * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. + * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for + * building their topology have to check whether to use this default + * implementation or to override it. + */ + return cpu == cpumask_first(topology_sibling_cpumask(cpu)); +} +#define topology_is_primary_thread topology_is_primary_thread + +#endif + static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); From 5deb9c789ae468a71a7c11c92d21769f7cbf68fa Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 11 Mar 2025 15:51:41 +0800 Subject: [PATCH 61/72] arch_topology: Support SMT control for OF based system When building the topology from the devicetree, we've already got the SMT thread number of each core. Update the largest SMT thread number and enable the SMT control by the end of topology parsing. The framework's SMT control provides two interfaces to the users through /sys/devices/system/cpu/smt/control (Documentation/ABI/testing/sysfs-devices-system-cpu): 1) enable SMT by writing "on" and disable by "off" 2) enable SMT by writing max_thread_number or disable by writing 1 Both methods support completely disabling/enabling the SMT cores, so both work correctly for a symmetric SMT platform and for an asymmetric platform with non-SMT cores and one type of SMT core, like: core A: 1 thread core B: X (X!=1) threads Note that for a theoretically possible multiple SMT-X (X>1) core platform the SMT control is also supported as expected but only by writing the "on/off" method. Reviewed-by: Pierre Gondois Reviewed-by: Dietmar Eggemann Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250311075143.61078-3-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- drivers/base/arch_topology.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3ebe77566788b..d409d323ee647 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -506,6 +507,10 @@ core_initcall(free_raw_capacity); #endif #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) + +/* Used to enable the SMT control */ +static unsigned int max_smt_thread_num = 1; + /* * This function returns the logic cpu number of the node. * There are basically three kinds of return values: @@ -565,6 +570,8 @@ static int __init parse_core(struct device_node *core, int package_id, i++; } while (1); + max_smt_thread_num = max_t(unsigned int, max_smt_thread_num, i); + cpu = get_cpu_for_node(core); if (cpu >= 0) { if (!leaf) { @@ -677,6 +684,17 @@ static int __init parse_socket(struct device_node *socket) if (!has_socket) ret = parse_cluster(socket, 0, -1, 0); + /* + * Reset the max_smt_thread_num to 1 on failure. Since on failure + * we need to notify the framework the SMT is not supported, but + * max_smt_thread_num can be initialized to the SMT thread number + * of the cores which are successfully parsed.
+ */ + if (ret) + max_smt_thread_num = 1; + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + return ret; } From e6b18ebfaf6360a357455507ae3e965989461c71 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 11 Mar 2025 15:51:42 +0800 Subject: [PATCH 62/72] arm64: topology: Support SMT control on ACPI based system For ACPI we'll build the topology from the PPTT, and we cannot directly get the SMT number of each core. Instead, use a temporary xarray to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL) and the SMT information of the first core in its heterogeneous CPU cluster when building the topology. Then we know the largest SMT number in the system. If a homogeneous system is using ACPI 6.2 or later, all the CPUs should be under the root node of PPTT. There'll be only one entry in the xarray and all the CPUs in the system will be assumed identical. The framework's SMT control provides two interfaces to the users [1] through /sys/devices/system/cpu/smt/control (Documentation/ABI/testing/sysfs-devices-system-cpu): 1) enable SMT by writing "on" and disable by "off" 2) enable SMT by writing max_thread_number or disable by writing 1 Both methods support completely disabling/enabling the SMT cores, so both work correctly for a symmetric SMT platform and for an asymmetric platform with non-SMT cores and one type of SMT core, like: core A: 1 thread core B: X (X!=1) threads Note that for a theoretically possible multiple SMT-X (X>1) core platform the SMT control is also supported as expected but only by writing the "on/off" method. Reviewed-by: Jonathan Cameron Reviewed-by: Hanjun Guo Reviewed-by: Pierre Gondois Reviewed-by: Dietmar Eggemann Signed-off-by: Yicong Yang Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250311075143.61078-4-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/topology.c | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index cb180684d10d5..0bcea4f89ea82 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,8 +15,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -37,17 +39,28 @@ static bool __init acpi_cpu_is_threaded(int cpu) return !!is_threaded; } +struct cpu_smt_info { + unsigned int thread_num; + int core_id; +}; + /* * Propagate the topology information of the processor_topology_node tree to the * cpu_topology array. */ int __init parse_acpi_topology(void) { + unsigned int max_smt_thread_num = 1; + struct cpu_smt_info *entry; + struct xarray hetero_cpu; + unsigned long hetero_id; int cpu, topology_id; if (acpi_disabled) return 0; + xa_init(&hetero_cpu); + for_each_possible_cpu(cpu) { topology_id = find_acpi_cpu_topology(cpu, 0); if (topology_id < 0) @@ -57,6 +70,34 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = topology_id; topology_id = find_acpi_cpu_topology(cpu, 1); cpu_topology[cpu].core_id = topology_id; + + /* + * In the PPTT, CPUs below a node with the 'identical + * implementation' flag have the same number of threads. + * Count the number of threads for only one CPU (i.e. + * one core_id) among those with the same hetero_id. + * See the comment of find_acpi_cpu_topology_hetero_id() + * for more details.
+ * + * One entry is created for each node having: + * - the 'identical implementation' flag + * - its parent not having the flag + */ + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); + entry = xa_load(&hetero_cpu, hetero_id); + if (!entry) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + WARN_ON_ONCE(!entry); + + if (entry) { + entry->core_id = topology_id; + entry->thread_num = 1; + xa_store(&hetero_cpu, hetero_id, + entry, GFP_KERNEL); + } + } else if (entry->core_id == topology_id) { + entry->thread_num++; + } } else { cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; @@ -67,6 +108,19 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].package_id = topology_id; } + /* + * This is a short loop since the number of XArray elements is the + * number of heterogeneous CPU clusters. On a homogeneous system + * there's only one entry in the XArray. + */ + xa_for_each(&hetero_cpu, hetero_id, entry) { + max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); + xa_erase(&hetero_cpu, hetero_id); + kfree(entry); + } + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + xa_destroy(&hetero_cpu); return 0; } #endif From eed4583bcf9a60f8d6dd3a3c7c94dea28134b1eb Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 11 Mar 2025 15:51:43 +0800 Subject: [PATCH 63/72] arm64: Kconfig: Enable HOTPLUG_SMT Enable HOTPLUG_SMT for SMT control. Reviewed-by: Jonathan Cameron Reviewed-by: Pierre Gondois Reviewed-by: Dietmar Eggemann Signed-off-by: Yicong Yang Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250311075143.61078-5-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 940343beb3d4c..65fe00b1922c3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -250,6 +250,7 @@ config ARM64 select HAVE_KRETPROBES select HAVE_GENERIC_VDSO select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU + select HOTPLUG_SMT if HOTPLUG_CPU select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN From 00cb1e01cd29c664dcd22cabc25be410e2c7faa7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 15 Jan 2025 16:25:55 +0000 Subject: [PATCH 64/72] arm64/sysreg: Fix unbalanced closing block This is a sysreg block so close it with one. This doesn't make a difference to the output because the script only matches on the beginning of the word to close blocks which is correct by coincidence here. Signed-off-by: James Clark Reviewed-by: Anshuman Khandual Acked-by: Mark Rutland Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250115162600.2153226-2-james.clark@linaro.org Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 891fe033e1b63..10809047fc871 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2076,7 +2076,7 @@ EndEnum Res0 4:2 Field 1 ExTRE Field 0 E0TRE -EndSysregFields +EndSysreg Sysreg SMPRI_EL1 3 0 1 2 4 Res0 63:4 From 2fdbf2ff388441476aae5d914a04c47b247b5e7b Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 15 Jan 2025 16:25:56 +0000 Subject: [PATCH 65/72] arm64/sysreg: Enforce whole word match for open/close tokens Opening and closing tokens can also match on words with common prefixes like "Endsysreg" vs "EndsysregFields". 
This could potentially make the script go wrong in weird ways so make it fall through to the fatal unhandled statement catcher if it doesn't fully match the current block. Closing ones also get expect_fields(1) to ensure nothing other than whitespace follows. Signed-off-by: James Clark Acked-by: Will Deacon Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20250115162600.2153226-3-james.clark@linaro.org Signed-off-by: Catalin Marinas --- arch/arm64/tools/gen-sysreg.awk | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index 1a2afc9fdd42e..f2a1732cb1f63 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -111,7 +111,7 @@ END { /^$/ { next } /^[\t ]*#/ { next } -/^SysregFields/ && block_current() == "Root" { +$1 == "SysregFields" && block_current() == "Root" { block_push("SysregFields") expect_fields(2) @@ -127,7 +127,8 @@ END { next } -/^EndSysregFields/ && block_current() == "SysregFields" { +$1 == "EndSysregFields" && block_current() == "SysregFields" { + expect_fields(1) if (next_bit > 0) fatal("Unspecified bits in " reg) @@ -145,7 +146,7 @@ END { next } -/^Sysreg/ && block_current() == "Root" { +$1 == "Sysreg" && block_current() == "Root" { block_push("Sysreg") expect_fields(7) @@ -177,7 +178,8 @@ END { next } -/^EndSysreg/ && block_current() == "Sysreg" { +$1 == "EndSysreg" && block_current() == "Sysreg" { + expect_fields(1) if (next_bit > 0) fatal("Unspecified bits in " reg) @@ -206,7 +208,7 @@ END { # Currently this is effectivey a comment, in future we may want to emit # defines for the fields. -(/^Fields/ || /^Mapping/) && block_current() == "Sysreg" { +($1 == "Fields" || $1 == "Mapping") && block_current() == "Sysreg" { expect_fields(2) if (next_bit != 63) @@ -224,7 +226,7 @@ END { } -/^Res0/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Res0" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, "RES0", $2) field = "RES0_" msb "_" lsb @@ -234,7 +236,7 @@ END { next } -/^Res1/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Res1" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, "RES1", $2) field = "RES1_" msb "_" lsb @@ -244,7 +246,7 @@ END { next } -/^Unkn/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Unkn" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, "UNKN", $2) field = "UNKN_" msb "_" lsb @@ -254,7 +256,7 @@ END { next } -/^Field/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Field" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(3) field = $3 parse_bitdef(reg, field, $2) @@ -265,14 +267,14 @@ END { next } -/^Raz/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Raz" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, field, $2) next } -/^SignedEnum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { block_push("Enum") expect_fields(3) @@ -285,7 +287,7 @@ END { next } -/^UnsignedEnum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "UnsignedEnum" && 
(block_current() == "Sysreg" || block_current() == "SysregFields") { block_push("Enum") expect_fields(3) @@ -298,7 +300,7 @@ END { next } -/^Enum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { block_push("Enum") expect_fields(3) @@ -310,7 +312,8 @@ END { next } -/^EndEnum/ && block_current() == "Enum" { +$1 == "EndEnum" && block_current() == "Enum" { + expect_fields(1) field = null msb = null From ed1ce841245d8febe3badf51c57e81c3619d0a1d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 7 Jan 2025 12:05:58 -0800 Subject: [PATCH 66/72] arm64: errata: Add QCOM_KRYO_4XX_GOLD to the spectre_bhb_k24_list Qualcomm Kryo 400-series Gold cores have a derivative of an ARM Cortex A76 in them. Since the A76 needs Spectre mitigation via looping, the Kryo 400-series Gold cores also need Spectre mitigation via looping. Qualcomm has confirmed that the proper "k" value for Kryo 400-series Gold cores is 24. Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Cc: Scott Bauer Signed-off-by: Douglas Anderson Acked-by: Trilok Soni Link: https://lore.kernel.org/r/20250107120555.v4.1.Ie4ef54abe02e7eb0eee50f830575719bf23bda48@changeid Signed-off-by: Catalin Marinas --- arch/arm64/kernel/proton-pack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index da53722f95d41..e149efadff201 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -866,6 +866,7 @@ u8 spectre_bhb_loop_affected(int scope) MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), {}, }; static const struct midr_range spectre_bhb_k11_list[] = { From e403e8538359d8580cbee1976ff71813e947101e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 7 Jan 2025 12:05:59 -0800 Subject: [PATCH 67/72] arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kryo CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable, though the system will attempt to mitigate with the information it has about the known cores.
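To make the inversion concrete, here is a toy standalone C model of the new policy (the part numbers and safe list are invented for illustration; the kernel uses MIDR range tables and is_midr_in_range_list()):

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical allowlisted part numbers. */
	static const unsigned int safe_parts[] = { 0xd03, 0xd05 };

	static bool is_listed_safe(unsigned int part)
	{
		for (unsigned int i = 0; i < sizeof(safe_parts) / sizeof(safe_parts[0]); i++)
			if (safe_parts[i] == part)
				return true;
		return false;
	}

	/* Old policy: vulnerable only if on a known-bad list.
	 * New policy: vulnerable unless explicitly known safe. */
	static bool is_vulnerable(unsigned int part)
	{
		return !is_listed_safe(part);
	}

	int main(void)
	{
		printf("0xd03 vulnerable? %d\n", is_vulnerable(0xd03)); /* 0: allowlisted */
		printf("0xabc vulnerable? %d\n", is_vulnerable(0xabc)); /* 1: unknown, assumed vulnerable */
		return 0;
	}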
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/spectre.h | 1 - arch/arm64/kernel/proton-pack.c | 203 ++++++++++++++++--------------- 2 files changed, 102 insertions(+), 102 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 0c4d9045c31f4..f1524cdeacf1c 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -97,7 +97,6 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); -u8 spectre_bhb_loop_affected(int scope); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index e149efadff201..17aa836fe46de 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -845,53 +845,70 @@ static unsigned long system_bhb_mitigations; * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any * SCOPE_SYSTEM call will give the right answer. */ -u8 spectre_bhb_loop_affected(int scope) +static bool is_spectre_bhb_safe(int scope) +{ + static const struct midr_range spectre_bhb_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + {}, + }; + static bool all_safe = true; + + if (scope != SCOPE_LOCAL_CPU) + return all_safe; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_safe_list)) + return true; + + all_safe = false; + + return false; +} + +static u8 spectre_bhb_loop_affected(void) { u8 k = 0; - static u8 max_bhb_k; - - if (scope == SCOPE_LOCAL_CPU) { - static const struct midr_range spectre_bhb_k32_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), - {}, - }; - static const struct midr_range spectre_bhb_k24_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), - MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), - {}, - }; - static const struct midr_range spectre_bhb_k11_list[] = { - MIDR_ALL_VERSIONS(MIDR_AMPERE1), - {}, - }; - static const struct midr_range spectre_bhb_k8_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - {}, - }; - - if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) - k = 32; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) - k = 24; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) - k = 11; - else if (is_midr_in_range_list(read_cpuid_id(), 
spectre_bhb_k8_list)) - k = 8; - - max_bhb_k = max(max_bhb_k, k); - } else { - k = max_bhb_k; - } + + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; return k; } @@ -917,29 +934,13 @@ static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) } } -static bool is_spectre_bhb_fw_affected(int scope) +static bool has_spectre_bhb_fw_mitigation(void) { - static bool system_affected; enum mitigation_state fw_state; bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; - static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - {}, - }; - bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), - spectre_bhb_firmware_mitigated_list); - - if (scope != SCOPE_LOCAL_CPU) - return system_affected; fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { - system_affected = true; - return true; - } - - return false; + return has_smccc && fw_state == SPECTRE_MITIGATED; } static bool supports_ecbhb(int scope) @@ -955,6 +956,8 @@ static bool supports_ecbhb(int scope) ID_AA64MMFR1_EL1_ECBHB_SHIFT); } +static u8 max_bhb_k; + bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope) { @@ -963,16 +966,18 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; - if (supports_clearbhb(scope)) - return true; - - if (spectre_bhb_loop_affected(scope)) - return true; + if (is_spectre_bhb_safe(scope)) + return false; - if (is_spectre_bhb_fw_affected(scope)) - return true; + /* + * At this point the core isn't known to be "safe" so we're going to + * assume it's vulnerable. We still need to update `max_bhb_k` though, + * but only if we aren't mitigating with clearbhb.
+ */ + if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) + max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected()); - return false; + return true; } static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) @@ -1003,7 +1008,7 @@ early_param("nospectre_bhb", parse_spectre_bhb_param); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) { bp_hardening_cb_t cpu_cb; - enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + enum mitigation_state state = SPECTRE_VULNERABLE; struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) @@ -1029,7 +1034,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); state = SPECTRE_MITIGATED; set_bit(BHB_INSN, &system_bhb_mitigations); - } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + } else if (spectre_bhb_loop_affected()) { /* * Ensure KVM uses the indirect vector which will have the * branchy-loop added. A57/A72-r0 will already have selected @@ -1042,32 +1047,29 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); state = SPECTRE_MITIGATED; set_bit(BHB_LOOP, &system_bhb_mitigations); - } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { - fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (fw_state == SPECTRE_MITIGATED) { - /* - * Ensure KVM uses one of the spectre bp_hardening - * vectors. The indirect vector doesn't include the EL3 - * call, so needs upgrading to - * HYP_VECTOR_SPECTRE_INDIRECT. - */ - if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) - data->slot += 1; - - this_cpu_set_vectors(EL1_VECTOR_BHB_FW); - - /* - * The WA3 call in the vectors supersedes the WA1 call - * made during context-switch. Uninstall any firmware - * bp_hardening callback. - */ - cpu_cb = spectre_v2_get_sw_mitigation_cb(); - if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) - __this_cpu_write(bp_hardening_data.fn, NULL); - - state = SPECTRE_MITIGATED; - set_bit(BHB_FW, &system_bhb_mitigations); - } + } else if (has_spectre_bhb_fw_mitigation()) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. 
+ */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); } update_mitigation_state(&spectre_bhb_state, state); @@ -1101,7 +1103,6 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, { u8 rd; u32 insn; - u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); BUG_ON(nr_inst != 1); /* MOV -> MOV */ @@ -1110,7 +1111,7 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, insn = le32_to_cpu(*origptr); rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); - insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0, AARCH64_INSN_VARIANT_64BIT, AARCH64_INSN_MOVEWIDE_ZERO); *updptr++ = cpu_to_le32(insn); From 0c9fc6e652cd5aed48c5f700c32b7642bea7f453 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 7 Jan 2025 12:06:00 -0800 Subject: [PATCH 68/72] arm64: errata: Add KRYO 2XX/3XX/4XX silver cores to Spectre BHB safe list Qualcomm has confirmed that, much like Cortex A53 and A55, KRYO 2XX/3XX/4XX silver cores are unaffected by Spectre BHB. Add them to the safe list. Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Cc: Scott Bauer Signed-off-by: Douglas Anderson Acked-by: Trilok Soni Link: https://lore.kernel.org/r/20250107120555.v4.3.Iab8dbfb5c9b1e143e7a29f410bce5f9525a0ba32@changeid Signed-off-by: Catalin Marinas --- arch/arm64/kernel/proton-pack.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 17aa836fe46de..89405be53d8fc 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -854,6 +854,9 @@ static bool is_spectre_bhb_safe(int scope) MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), {}, }; static bool all_safe = true; From a9b5bd81b294d30a747edd125e9f6aef2def7c79 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 7 Jan 2025 12:06:01 -0800 Subject: [PATCH 69/72] arm64: cputype: Add MIDR_CORTEX_A76AE From the TRM, MIDR_CORTEX_A76AE has a partnum of 0xD0E and an implementor of 0x41 (ARM). Add the values.
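As a sanity check on those numbers, the MIDR model value can be composed by hand. A small standalone C sketch, assuming the standard MIDR_EL1 layout (implementer in bits [31:24], architecture nibble 0xf in [19:16], part number in [15:4]), which is what the kernel's MIDR_CPU_MODEL() macro encodes:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Cortex-A76AE: implementer 0x41 (ARM), partnum 0xD0E;
		 * variant and revision left as zero. */
		uint32_t midr = (0x41u << 24) | (0xfu << 16) | (0xD0Eu << 4);

		printf("Cortex-A76AE model: 0x%08x\n", midr); /* prints 0x410fd0e0 */
		return 0;
	}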
Cc: stable@vger.kernel.org # dependency of the next fix in the series Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20250107120555.v4.4.I151f3b7ee323bcc3082179b8c60c3cd03308aa94@changeid Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 6f3f4142e214f..40d310cb2da9a 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -75,6 +75,7 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_CORTEX_A76AE 0xD0E #define ARM_CPU_PART_NEOVERSE_V1 0xD40 #define ARM_CPU_PART_CORTEX_A78 0xD41 #define ARM_CPU_PART_CORTEX_A78AE 0xD42 @@ -159,6 +160,7 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_CORTEX_A76AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76AE) #define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) #define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 7 Jan 2025 12:06:02 -0800 Subject: [PATCH 70/72] arm64: errata: Add newer ARM cores to the spectre_bhb_loop_affected() lists When comparing to the ARM list [1], it appears that several ARM cores were missing from the lists in spectre_bhb_loop_affected(). Add them. NOTE: for some of these cores it may not matter since other ways of clearing the BHB may be used (like the CLRBHB instruction or ECBHB), but it still seems good to have all the info from ARM's whitepaper included. 
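For context on how differing "k" values coexist in one system: the kernel keeps the maximum across all cores (max_bhb_k in the earlier patch) and patches that count into the mitigation vectors. A toy standalone C sketch with invented per-core values:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical loop counts for a mixed big.LITTLE system. */
		unsigned int per_cpu_k[] = { 24, 32, 38 };
		unsigned int max_bhb_k = 0;

		for (unsigned int i = 0; i < sizeof(per_cpu_k) / sizeof(per_cpu_k[0]); i++)
			if (per_cpu_k[i] > max_bhb_k)
				max_bhb_k = per_cpu_k[i];

		/* The vectors' "MOV tmp, #k" is rewritten with this value. */
		printf("loop count patched into the vectors: %u\n", max_bhb_k); /* 38 */
		return 0;
	}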
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Signed-off-by: Douglas Anderson Reviewed-by: James Morse Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41de587d015c82@changeid Signed-off-by: Catalin Marinas --- arch/arm64/kernel/proton-pack.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 89405be53d8fc..0f51fd10b4b06 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void) { u8 k = 0; + static const struct midr_range spectre_bhb_k132_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + }; + static const struct midr_range spectre_bhb_k38_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + }; static const struct midr_range spectre_bhb_k32_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), @@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void) }; static const struct midr_range spectre_bhb_k24_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), @@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void) {}, }; - if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list)) + k = 132; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list)) + k = 38; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) k = 24; From 9651f7899cc59bcf4f6e543a2d24b404997100c0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 17 Mar 2025 12:00:33 +0000 Subject: [PATCH 71/72] perf/arm_cspmu: Fix missing io.h include Adding the writel() calls needs io.h, which apparently gets transitively included somewhere on arm64, but not elsewhere. Fixes: 6de0298a3925 ("perf/arm_cspmu: Generalise event filtering") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503150649.Dol8RBSh-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202503152245.cAG4FMfi-lkp@intel.com/ Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/657935ca177024ad08d5ec6f85e8faf75f82cf65.1742212833.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/nvidia_cspmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 9e817f120828f..dc6d4e3e2a1ba 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -6,6 +6,7 @@ /* Support for NVIDIA specific attributes. */ +#include <linux/io.h> #include <linux/module.h> #include <linux/topology.h> From 892d20acf36c3f633a1320610f3327acd28e90de Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 17 Feb 2025 08:39:06 +0100 Subject: [PATCH 72/72] arm64: mm: Don't use %pK through printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restricted pointers ("%pK") are not meant to be used through printk(). It can unintentionally expose security sensitive, raw pointer values. Use regular pointer formatting instead.
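A rough userspace illustration of the distinction (the mixing function below is an arbitrary stand-in; the kernel's %p obfuscates pointers with siphash and a boot-time secret key, while %pK may print the raw value depending on kptr_restrict):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int object;
		uintptr_t raw = (uintptr_t)&object;
		/* Toy mixing step, not the kernel's hash. */
		uintptr_t obfuscated = (raw ^ 0xa5a5a5a5a5a5a5a5u) * 0x9e3779b97f4a7c15u;

		printf("raw pointer (what a %%pK leak exposes): %#llx\n",
		       (unsigned long long)raw);
		printf("obfuscated (the idea behind %%p):       %#llx\n",
		       (unsigned long long)obfuscated);
		return 0;
	}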
Link: https://lore.kernel.org/lkml/20250113171731-dc10e3c1-da64-4af0-b767-7c7070468023@linutronix.de/ Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250217-restricted-pointers-arm64-v1-1-14bb1f516b01@linutronix.de Signed-off-by: Catalin Marinas --- arch/arm64/mm/physaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/physaddr.c b/arch/arm64/mm/physaddr.c index cde44c13dda1b..7d94e09b01b35 100644 --- a/arch/arm64/mm/physaddr.c +++ b/arch/arm64/mm/physaddr.c @@ -10,7 +10,7 @@ phys_addr_t __virt_to_phys(unsigned long x) { WARN(!__is_lm_address(__tag_reset(x)), - "virt_to_phys used for non-linear address: %pK (%pS)\n", + "virt_to_phys used for non-linear address: %p (%pS)\n", (void *)x, (void *)x);