locking/pvqspinlock: Only kick CPU at unlock time
For an over-committed guest with more vCPUs than physical CPUs
available, it is possible that a vCPU may be kicked twice before
getting the lock - once before it becomes queue head and once again
before it gets the lock. All this CPU kicking and halting (VMEXIT)
can be expensive and slow down system performance.

This patch adds a new vCPU state (vcpu_hashed) which enables the code
to delay CPU kicking until unlock time. Once this state is set,
the new lock holder will set _Q_SLOW_VAL and fill in the hash table
on behalf of the halted queue head vCPU. The original vcpu_halted
state will be used by pv_wait_node() only, to differentiate other
queue nodes from the queue head.
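
To make the handover easier to follow, here is a minimal illustrative sketch
of the state transitions described above. It is not the kernel code: the
helper names (wait_node_sketch, kick_node_sketch, unlock_sketch) and the use
of C11 <stdatomic.h> in place of the kernel's smp_store_mb()/cmpxchg()/pv_*
primitives are stand-ins; the real implementation is in the diff below.

#include <stdatomic.h>

enum vcpu_state { vcpu_running, vcpu_halted, vcpu_hashed };

struct pv_node_sketch {
	_Atomic int state;
	int cpu;
};

/* Queue-node waiter: announce "halted" before sleeping. */
static void wait_node_sketch(struct pv_node_sketch *pn, _Atomic int *locked)
{
	int expected = vcpu_halted;

	atomic_store(&pn->state, vcpu_halted);	/* smp_store_mb() in the patch */
	while (!atomic_load(locked)) {
		/* pv_wait(&pn->state, vcpu_halted) in the real code */
	}

	/*
	 * Keep vcpu_hashed if the lock holder advanced us; otherwise go
	 * back to vcpu_running so unlock will not kick us needlessly.
	 */
	atomic_compare_exchange_strong(&pn->state, &expected, vcpu_running);
}

/*
 * New lock holder handing over the MCS lock: advance the waiter's state
 * instead of kicking it right away.
 */
static void kick_node_sketch(struct pv_node_sketch *pn)
{
	int expected = vcpu_halted;

	if (!atomic_compare_exchange_strong(&pn->state, &expected, vcpu_hashed))
		return;	/* waiter was still running; it will see next->locked */

	/*
	 * Here the real code sets _Q_SLOW_VAL and does pv_hash(lock, pn)
	 * on the halted waiter's behalf.
	 */
}

/* Unlock slow path: the one and only kick, and only for a hashed waiter. */
static void unlock_sketch(struct pv_node_sketch *pn)
{
	if (atomic_load(&pn->state) == vcpu_hashed) {
		/* pv_kick(pn->cpu) in the real code */
	}
}

The point of the paired cmpxchg() is that exactly one side wins: either the
waiter was still running and advances itself, or the lock holder hashes the
lock on its behalf and the kick is deferred to unlock time.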

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Douglas Hatch <doug.hatch@hp.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1436647018-49734-2-git-send-email-Waiman.Long@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Waiman Long authored and Ingo Molnar committed Aug 3, 2015
1 parent ffffeaf commit 75d2270
Showing 2 changed files with 51 additions and 21 deletions.
6 changes: 3 additions & 3 deletions kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_head(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 
@@ -440,7 +440,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
-	pv_kick_node(next);
+	pv_kick_node(lock, next);
 
 release:
 	/*
66 changes: 48 additions & 18 deletions kernel/locking/qspinlock_paravirt.h
@@ -22,9 +22,14 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
 	vcpu_running = 0,
-	vcpu_halted,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
 };
 
 struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -172,19 +178,20 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
 		 *     MB			      MB
-		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the xchg() from pv_kick_node().
+		 * Matches the cmpxchg() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
 		if (!READ_ONCE(node->locked))
 			pv_wait(&pn->state, vcpu_halted);
 
 		/*
-		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
+		 * so that pv_wait_head() knows to not also try to hash this lock.
 		 */
-		WRITE_ONCE(pn->state, vcpu_running);
+		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * MCS lock will be released soon.
 		 */
 	}
+
 	/*
 	 * By now our node->locked should be 1 and our caller will not actually
 	 * spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
+	 * If the vCPU is indeed halted, advance its state to match that of
+	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+	 * observe its next->locked value and advance itself.
 	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
+	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 */
+	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
 	 *
-	 * See the comment in pv_wait_node().
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
 	 */
-	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-		pv_kick(pn->cpu);
+	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, pn);
 }
 
 /*
@@ -233,16 +252,24 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct qspinlock **lp = NULL;
 	int loop;
 
+	/*
+	 * If pv_kick_node() already advanced our state, we don't need to
+	 * insert ourselves into the hash table anymore.
+	 */
+	if (READ_ONCE(pn->state) == vcpu_hashed)
+		lp = (struct qspinlock **)1;
+
 	for (;;) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
 				return;
 			cpu_relax();
 		}
 
-		WRITE_ONCE(pn->state, vcpu_halted);
 		if (!lp) { /* ONCE */
+			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);
+
 			/*
 			 * We must hash before setting _Q_SLOW_VAL, such that
 			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
@@ -333,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
 	 * however we can still use the pv_node to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless other than the additional latency in completing
+	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_halted)
+	if (READ_ONCE(node->state) == vcpu_hashed)
 		pv_kick(node->cpu);
 }
 /*
