From 8929bc9659640f35dd2ef8373263cbd885b4a072 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:15 +0100 Subject: [PATCH 1/6] KVM: Use acquire/release semantics when accessing dirty ring GFN state The current implementation of the dirty ring has an implicit requirement that stores to the dirty ring from userspace must be: - be ordered with one another - visible from another CPU executing a ring reset While these implicit requirements work well for x86 (and any other TSO-like architecture), they do not work for more relaxed architectures such as arm64 where stores to different addresses can be freely reordered, and loads from these addresses not observing writes from another CPU unless the required barriers (or acquire/release semantics) are used. In order to start fixing this, upgrade the ring reset accesses: - the kvm_dirty_gfn_harvested() helper now uses acquire semantics so it is ordered after all previous writes, including that from userspace - the kvm_dirty_gfn_set_invalid() helper now uses release semantics so that the next_slot and next_offset reads don't drift past the entry invalidation This is only a partial fix as the userspace side also need upgrading. Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-2-maz@kernel.org --- virt/kvm/dirty_ring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index f4c2a6eb1666b..d6fabf238032a 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -74,7 +74,7 @@ int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) { - gfn->flags = 0; + smp_store_release(&gfn->flags, 0); } static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) @@ -84,7 +84,7 @@ static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) { - return gfn->flags & KVM_DIRTY_GFN_F_RESET; + return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET; } int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) From 17601bfed909fa080fcfd227b57da2bd4dc2d2a6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:16 +0100 Subject: [PATCH 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option In order to differenciate between architectures that require no extra synchronisation when accessing the dirty ring and those who do, add a new capability (KVM_CAP_DIRTY_LOG_RING_ACQ_REL) that identify the latter sort. TSO architectures can obviously advertise both, while relaxed architectures must only advertise the ACQ_REL version. This requires some configuration symbol rejigging, with HAVE_KVM_DIRTY_RING being only indirectly selected by two top-level config symbols: - HAVE_KVM_DIRTY_RING_TSO for strongly ordered architectures (x86) - HAVE_KVM_DIRTY_RING_ACQ_REL for weakly ordered architectures (arm64) Suggested-by: Paolo Bonzini Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-3-maz@kernel.org --- arch/x86/kvm/Kconfig | 2 +- include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 14 ++++++++++++++ virt/kvm/kvm_main.c | 9 ++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index e3cbd77061364..876748b236ffe 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,7 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD - select HAVE_KVM_DIRTY_RING + select HAVE_KVM_DIRTY_RING_TSO select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index eed0315a77a6d..0d5d4419139ae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index a8c5c9f06b3cf..800f9470e36b1 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -19,6 +19,20 @@ config HAVE_KVM_IRQ_ROUTING config HAVE_KVM_DIRTY_RING bool +# Only strongly ordered architectures can select this, as it doesn't +# put any explicit constraint on userspace ordering. They can also +# select the _ACQ_REL version. +config HAVE_KVM_DIRTY_RING_TSO + bool + select HAVE_KVM_DIRTY_RING + depends on X86 + +# Weakly ordered architectures can only select this, advertising +# to userspace the additional ordering requirements. +config HAVE_KVM_DIRTY_RING_ACQ_REL + bool + select HAVE_KVM_DIRTY_RING + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 584a5bab3af39..5b064dbadaf42 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4475,7 +4475,13 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; case KVM_CAP_DIRTY_LOG_RING: -#ifdef CONFIG_HAVE_KVM_DIRTY_RING +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO + return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; @@ -4580,6 +4586,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return 0; } case KVM_CAP_DIRTY_LOG_RING: + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); default: return kvm_vm_ioctl_enable_cap(kvm, cap); From fc0693d4e5afe3c110503c3afa9f60600f9e964b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:17 +0100 Subject: [PATCH 3/6] KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL Since x86 is TSO (give or take), allow it to advertise the new ACQ_REL version of the dirty ring capability. No other change is required for it. Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-4-maz@kernel.org --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 876748b236ffe..67be7f217e37b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_TSO + select HAVE_KVM_DIRTY_RING_ACQ_REL select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING From 671c8c7f9f2349d8b2176ad810f1406794011f63 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:18 +0100 Subject: [PATCH 4/6] KVM: Document weakly ordered architecture requirements for dirty ring Now that the kernel can expose to userspace that its dirty ring management relies on explicit ordering, document these new requirements for VMMs to do the right thing. Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-5-maz@kernel.org --- Documentation/virt/kvm/api.rst | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index abd7c32126ce0..32427ea160dfa 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8019,8 +8019,8 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. -8.29 KVM_CAP_DIRTY_LOG_RING ---------------------------- +8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL +---------------------------------------------------------- :Architectures: x86 :Parameters: args[0] - size of the dirty log ring @@ -8078,6 +8078,11 @@ on to the next GFN. The userspace should continue to do this until the flags of a GFN have the DIRTY bit cleared, meaning that it has harvested all the dirty GFNs that were available. +Note that on weakly ordered architectures, userspace accesses to the +ring buffer (and more specifically the 'flags' field) must be ordered, +using load-acquire/store-release accessors when available, or any +other memory barrier that will ensure this ordering. + It's not necessary for userspace to harvest the all dirty GFNs at once. However it must collect the dirty GFNs in sequence, i.e., the userspace program cannot skip one dirty GFN to collect the one next to it. @@ -8106,6 +8111,14 @@ KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual machine will switch to ring-buffer dirty page tracking and further KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail. +NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that +should be exposed by weakly ordered architecture, in order to indicate +the additional memory ordering requirements imposed on userspace when +reading the state of an entry and mutating it from DIRTY to HARVESTED. +Architecture with TSO-like ordering (such as x86) are allowed to +expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL +to userspace. + 8.30 KVM_CAP_XEN_HVM -------------------- From 4eb6486cb43c93382c27a2659ba978c660e98498 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:19 +0100 Subject: [PATCH 5/6] KVM: selftests: dirty-log: Upgrade flag accesses to acquire/release semantics In order to preserve ordering, make sure that the flag accesses in the dirty log are done using acquire/release accessors. Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-6-maz@kernel.org --- tools/testing/selftests/kvm/dirty_log_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 9c883c94d478f..53627add8a7cb 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "kvm_util.h" #include "test_util.h" @@ -279,12 +280,12 @@ static void dirty_ring_create_vm_done(struct kvm_vm *vm) static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) { - return gfn->flags == KVM_DIRTY_GFN_F_DIRTY; + return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY; } static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) { - gfn->flags = KVM_DIRTY_GFN_F_RESET; + smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET); } static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, From 4b3402f1f4d9860301d6d5cd7aff3b67f678d577 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:20 +0100 Subject: [PATCH 6/6] KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ACQ_REL if available Pick KVM_CAP_DIRTY_LOG_RING_ACQ_REL if exposed by the kernel. Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-7-maz@kernel.org --- tools/testing/selftests/kvm/dirty_log_test.c | 3 ++- tools/testing/selftests/kvm/lib/kvm_util.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 53627add8a7cb..b5234d6efbe15 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -265,7 +265,8 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) static bool dirty_ring_supported(void) { - return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING); + return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) || + kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL)); } static void dirty_ring_create_vm_done(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9889fe0d8919c..411a4c0bc81c8 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -82,7 +82,10 @@ unsigned int kvm_check_cap(long cap) void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { - vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); + if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); + else + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); vm->dirty_ring_size = ring_size; }