From 6bad6ecc63b75af294ff3f56f54d6b857c8964a5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 1 Apr 2025 08:47:26 -0700
Subject: [PATCH 1/2] KVM: VMX: Assert that IRQs are disabled when putting
 vCPU on PI wakeup list

Assert that IRQs are already disabled when putting a vCPU on a CPU's PI
wakeup list, as opposed to saving/disabling+restoring IRQs.  KVM relies
on IRQs being disabled until the vCPU task is fully scheduled out, i.e.
until the scheduler has dropped all of its per-CPU locks (e.g. for the
runqueue), as attempting to wake the task while it's being scheduled out
could lead to deadlock.

Signed-off-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Reviewed-by: Yan Zhao
Message-ID: <20250401154727.835231-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/posted_intr.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index ec08fa3caf43c..840d435229a87 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -148,9 +148,8 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct pi_desc old, new;
-	unsigned long flags;
 
-	local_irq_save(flags);
+	lockdep_assert_irqs_disabled();
 
 	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 	list_add_tail(&vmx->pi_wakeup_list,
@@ -176,8 +175,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	 */
 	if (pi_test_on(&new))
 		__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
-
-	local_irq_restore(flags);
 }
 
 static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
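The assert-vs-save pattern above generalizes beyond KVM.  Below is a
minimal, self-contained sketch (the demo_* names are hypothetical, not
actual KVM code) of the contract the patch enforces: the caller is
responsible for disabling IRQs, and the callee asserts that contract
rather than silently papering over a buggy caller with local_irq_save():

  #include <linux/lockdep.h>
  #include <linux/spinlock.h>

  /* Hypothetical stand-in for KVM's per-CPU wakeup lock. */
  static DEFINE_RAW_SPINLOCK(demo_lock);

  /*
   * Caller must have IRQs disabled, e.g. because it runs while the
   * scheduler holds per-CPU locks.  lockdep_assert_irqs_disabled()
   * documents and enforces the requirement; local_irq_save() would
   * instead mask a caller that arrived with IRQs enabled.
   */
  static void demo_add_to_wakeup_list(void)
  {
          lockdep_assert_irqs_disabled();

          raw_spin_lock(&demo_lock);
          /* ... add the entry to the per-CPU list ... */
          raw_spin_unlock(&demo_lock);
  }

With lockdep enabled, a caller that reaches demo_add_to_wakeup_list()
with IRQs on gets a splat immediately, instead of a latent deadlock.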
From c0b8dcabb2cddc98c265548632c39e97422f61b6 Mon Sep 17 00:00:00 2001
From: Yan Zhao
Date: Tue, 1 Apr 2025 08:47:27 -0700
Subject: [PATCH 2/2] KVM: VMX: Use separate subclasses for PI wakeup lock to
 squash false positive

Use a separate subclass when acquiring KVM's per-CPU posted interrupts
wakeup lock in the scheduled out path, i.e. when adding a vCPU to the
list of vCPUs to wake, to work around a false positive deadlock.

  Chain exists of:
   &p->pi_lock --> &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu)

  Possible unsafe locking scenario:

        CPU0                        CPU1
        ----                        ----
   lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
                               lock(&rq->__lock);
                               lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
   lock(&p->pi_lock);

  *** DEADLOCK ***

In the wakeup handler, the callchain is *always*:

  sysvec_kvm_posted_intr_wakeup_ipi()
  |
  --> pi_wakeup_handler()
      |
      --> kvm_vcpu_wake_up()
          |
          --> try_to_wake_up(),

and the lock order is:

  &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) --> &p->pi_lock.

For the schedule out path, the callchain is always (for all intents and
purposes; if the kernel is preemptible, kvm_sched_out() can be called
from something other than schedule(), but the beginning of the callchain
will be the same point in vcpu_block()):

  vcpu_block()
  |
  --> schedule()
      |
      --> kvm_sched_out()
          |
          --> vmx_vcpu_put()
              |
              --> vmx_vcpu_pi_put()
                  |
                  --> pi_enable_wakeup_handler()

and the lock order is:

  &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu)

I.e. lockdep sees AB+BC ordering for schedule out, and CA ordering for
wakeup, and complains about the A=>C versus C=>A inversion.  In practice,
deadlock can't occur between schedule out and the wakeup handler, as they
are mutually exclusive.  The entirety of the schedule out code that runs
with the problematic scheduler locks held does so with IRQs disabled,
i.e. it can't run concurrently with the wakeup handler.

Use a subclass instead of disabling lockdep entirely, and tell lockdep
that both subclasses are being acquired when loading a vCPU, as the
sched_out and sched_in paths are NOT mutually exclusive, e.g.

      CPU 0                 CPU 1
  ---------------     ---------------
  vCPU0 sched_out
                      vCPU1 sched_in
                      vCPU1 sched_out
  vCPU0 sched_in

where vCPU0's sched_in may race with vCPU1's sched_out on CPU 0's wakeup
list+lock.

Signed-off-by: Yan Zhao
Signed-off-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Message-ID: <20250401154727.835231-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/posted_intr.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 840d435229a87..51116fe69a500 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
  */
 static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
 
+#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
+
 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 {
 	return &(to_vmx(vcpu)->pi_desc);
@@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	 * current pCPU if the task was migrated.
 	 */
 	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
-		raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+		raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
+
+		/*
+		 * In addition to taking the wakeup lock for the regular/IRQ
+		 * context, tell lockdep it is being taken for the "sched out"
+		 * context as well.  vCPU loads happen in task context, and
+		 * this is taking the lock of the *previous* CPU, i.e. can race
+		 * with both the scheduler and the wakeup handler.
+		 */
+		raw_spin_lock(spinlock);
+		spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
 		list_del(&vmx->pi_wakeup_list);
-		raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+		spin_release(&spinlock->dep_map, _RET_IP_);
+		raw_spin_unlock(spinlock);
 	}
 
 	dest = cpu_physical_id(cpu);
@@ -151,7 +164,20 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 
 	lockdep_assert_irqs_disabled();
 
-	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+	/*
+	 * Acquire the wakeup lock using the "sched out" context to work
+	 * around a lockdep false positive.  When this is called, schedule()
+	 * holds various per-CPU scheduler locks.  When the wakeup handler
+	 * runs, it holds this CPU's wakeup lock while calling try_to_wake_up(),
+	 * which can eventually take the aforementioned scheduler locks, which
+	 * causes lockdep to assume there is a deadlock.
+	 *
+	 * Deadlock can't actually occur because IRQs are disabled for the
+	 * entirety of the sched_out critical section, i.e. the wakeup handler
+	 * can't run while the scheduler locks are held.
+	 */
+	raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
+			     PI_LOCK_SCHED_OUT);
 	list_add_tail(&vmx->pi_wakeup_list,
 		      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
 	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
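For reference, the annotation pattern in this patch can be reduced to a
self-contained sketch.  The demo_* names below are hypothetical, not
KVM's actual code, and the safety argument is the same assumption the
changelog states: the subclass-1 path always runs with IRQs disabled, so
it cannot interleave with the IRQ-context path lockdep would otherwise
chain it with:

  #include <linux/lockdep.h>
  #include <linux/spinlock.h>

  /* Subclass 1; subclass 0 is the default for a plain raw_spin_lock(). */
  #define DEMO_LOCK_SCHED_OUT	SINGLE_DEPTH_NESTING

  static DEFINE_RAW_SPINLOCK(demo_wakeup_lock);

  /* Runs under scheduler locks, always with IRQs disabled. */
  static void demo_sched_out(void)
  {
          lockdep_assert_irqs_disabled();

          /*
           * Acquire in subclass 1 so lockdep doesn't chain this
           * acquisition with the IRQ handler's subclass-0 acquisition,
           * squashing the false positive inversion report.
           */
          raw_spin_lock_nested(&demo_wakeup_lock, DEMO_LOCK_SCHED_OUT);
          /* ... add entry to the wakeup list ... */
          raw_spin_unlock(&demo_wakeup_lock);
  }

  /*
   * Task-context load path; it can race with both demo_sched_out() and
   * the (not shown) IRQ-context wakeup handler, so both subclasses are
   * announced to lockdep even though only one real acquisition occurs.
   */
  static void demo_load(void)
  {
          raw_spin_lock(&demo_wakeup_lock);               /* subclass 0 */
          spin_acquire(&demo_wakeup_lock.dep_map,
                       DEMO_LOCK_SCHED_OUT, 0, _RET_IP_); /* subclass 1 */

          /* ... remove entry from the wakeup list ... */

          spin_release(&demo_wakeup_lock.dep_map, _RET_IP_);
          raw_spin_unlock(&demo_wakeup_lock);
  }

Note the trade-off: a subclass makes lockdep track the two acquisition
contexts as distinct lock classes, so real inversions between them go
undetected.  It is only appropriate when, as here, mutual exclusion is
guaranteed by other means (IRQs disabled across the critical section).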