Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - membarrier updates (Mathieu Desnoyers)

 - SMP balancing optimizations (Mel Gorman)

 - stats update optimizations (Peter Zijlstra)

 - RT scheduler race fixes (Steven Rostedt)

 - misc fixes and updates

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Use a recently used CPU as an idle candidate and the basis for SIS
  sched/fair: Do not migrate if the prev_cpu is idle
  sched/fair: Restructure wake_affine*() to return a CPU id
  sched/fair: Remove unnecessary parameters from wake_affine_idle()
  sched/rt: Make update_curr_rt() more accurate
  sched/rt: Up the root domain ref count when passing it around via IPIs
  sched/rt: Use container_of() to get root domain in rto_push_irq_work_func()
  sched/core: Optimize update_stats_*()
  sched/core: Optimize ttwu_stat()
  membarrier/selftest: Test private expedited sync core command
  membarrier/arm64: Provide core serializing command
  membarrier/x86: Provide core serializing command
  membarrier: Provide core serializing command, *_SYNC_CORE
  lockin/x86: Implement sync_core_before_usermode()
  locking: Introduce sync_core_before_usermode()
  membarrier/selftest: Test global expedited command
  membarrier: Provide GLOBAL_EXPEDITED command
  membarrier: Document scheduler barrier requirements
  powerpc, membarrier: Skip memory barrier in switch_mm()
  membarrier/selftest: Test private expedited command
Linus Torvalds committed Feb 7, 2018
2 parents 4b0dda4 + 8284507 commit ab2d92a
Showing 25 changed files with 750 additions and 127 deletions.
1 change: 1 addition & 0 deletions MAINTAINERS
@@ -9025,6 +9025,7 @@ L: linux-kernel@vger.kernel.org
S: Supported
F: kernel/sched/membarrier.c
F: include/uapi/linux/membarrier.h
F: arch/powerpc/include/asm/membarrier.h

MEMORY MANAGEMENT
L: linux-mm@kvack.org
1 change: 1 addition & 0 deletions arch/arm64/Kconfig
@@ -16,6 +16,7 @@ config ARM64
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
4 changes: 4 additions & 0 deletions arch/arm64/kernel/entry.S
@@ -324,6 +324,10 @@ alternative_else_nop_endif
ldp x28, x29, [sp, #16 * 14]
ldr lr, [sp, #S_LR]
add sp, sp, #S_FRAME_SIZE // restore sp
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE relies on eret context synchronization
* when returning from IPI handler, and when returning to user-space.
*/

.if \el == 0
alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
1 change: 1 addition & 0 deletions arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_PMEM_API if PPC64
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
27 changes: 27 additions & 0 deletions arch/powerpc/include/asm/membarrier.h
@@ -0,0 +1,27 @@
#ifndef _ASM_POWERPC_MEMBARRIER_H
#define _ASM_POWERPC_MEMBARRIER_H

static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
struct mm_struct *next,
struct task_struct *tsk)
{
/*
* Only need the full barrier when switching between processes.
* Barrier when switching from kernel to userspace is not
* required here, given that it is implied by mmdrop(). Barrier
* when switching from userspace to kernel is not needed after
* store to rq->curr.
*/
if (likely(!(atomic_read(&next->membarrier_state) &
(MEMBARRIER_STATE_PRIVATE_EXPEDITED |
MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
return;

/*
* The membarrier system call requires a full memory barrier
* after storing to rq->curr, before going back to user-space.
*/
smp_mb();
}

#endif /* _ASM_POWERPC_MEMBARRIER_H */
7 changes: 7 additions & 0 deletions arch/powerpc/mm/mmu_context.c
@@ -12,6 +12,7 @@

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>

@@ -58,6 +59,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*
* On the read side the barrier is in pte_xchg(), which orders
* the store to the PTE vs the load of mm_cpumask.
*
* This full barrier is needed by membarrier when switching
* between processes after store to rq->curr, before user-space
* memory accesses.
*/
smp_mb();

@@ -80,6 +85,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,

if (new_on_cpu)
radix_kvm_prefetch_workaround(next);
else
membarrier_arch_switch_mm(prev, next, tsk);

/*
* The actual HW switching method differs between the various
2 changes: 2 additions & 0 deletions arch/x86/Kconfig
@@ -55,13 +55,15 @@ config X86
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if X86_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
5 changes: 5 additions & 0 deletions arch/x86/entry/entry_32.S
@@ -566,6 +566,11 @@ restore_all:
.Lrestore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
.Lirq_return:
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
* when returning from IPI handler and when returning from
* scheduler to user-space.
*/
INTERRUPT_RETURN

.section .fixup, "ax"
4 changes: 4 additions & 0 deletions arch/x86/entry/entry_64.S
@@ -691,6 +691,10 @@ GLOBAL(restore_regs_and_return_to_kernel)
POP_EXTRA_REGS
POP_C_REGS
addq $8, %rsp /* skip regs->orig_ax */
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
* when returning from IPI handler.
*/
INTERRUPT_RETURN

ENTRY(native_iret)
28 changes: 28 additions & 0 deletions arch/x86/include/asm/sync_core.h
@@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SYNC_CORE_H
#define _ASM_X86_SYNC_CORE_H

#include <linux/preempt.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>

/*
* Ensure that a core serializing instruction is issued before returning
* to user-mode. x86 implements return to user-space through sysexit,
* sysretl, and sysretq, which are not core serializing.
*/
static inline void sync_core_before_usermode(void)
{
/* With PTI, we unconditionally serialize before running user code. */
if (static_cpu_has(X86_FEATURE_PTI))
return;
/*
* Return from interrupt and NMI is done through iret, which is core
* serializing.
*/
if (in_irq() || in_nmi())
return;
sync_core();
}

#endif /* _ASM_X86_SYNC_CORE_H */
6 changes: 6 additions & 0 deletions arch/x86/mm/tlb.c
@@ -229,6 +229,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#endif
this_cpu_write(cpu_tlbstate.is_lazy, false);

/*
* The membarrier system call requires a full memory barrier and
* core serialization before returning to user-space, after
* storing to rq->curr. Writing to CR3 provides that full
* memory barrier and core serializing instruction.
*/
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
8 changes: 8 additions & 0 deletions include/linux/sched.h
@@ -555,6 +555,14 @@ struct task_struct {
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;

/*
* recent_used_cpu is initially set as the last CPU used by a task
* that wakes affine another task. Waker/wakee relationships can
* push tasks around a CPU where each wakeup moves to the next one.
* Tracking a recently used CPU allows a quick search for a recently
* used CPU that may be idle.
*/
int recent_used_cpu;
int wake_cpu;
#endif
int on_rq;
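As a rough illustration of how the recent_used_cpu field added above is meant to be used on the wakeup path, the sketch below approximates the corresponding select_idle_sibling() check in kernel/sched/fair.c (that hunk is not shown in this excerpt, so the conditions are an approximation rather than the literal change):

	/*
	 * Sketch: consider a recently used CPU as an idle candidate before
	 * doing a wider scan. idle_cpu(), cpus_share_cache() and
	 * cpumask_test_cpu() are existing scheduler helpers; the exact
	 * condition set here is illustrative.
	 */
	recent_used_cpu = p->recent_used_cpu;
	if (recent_used_cpu != prev &&
	    recent_used_cpu != target &&
	    cpus_share_cache(recent_used_cpu, target) &&
	    idle_cpu(recent_used_cpu) &&
	    cpumask_test_cpu(recent_used_cpu, &p->cpus_allowed)) {
		/* Remember prev as a candidate for the next wakeup. */
		p->recent_used_cpu = prev;
		return recent_used_cpu;
	}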
35 changes: 33 additions & 2 deletions include/linux/sched/mm.h
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>

/*
* Routines for handling mm_structs
@@ -194,18 +195,48 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)

#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1),
MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2),
MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
};

enum {
MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
if (likely(!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
return;
sync_core_before_usermode();
}

static inline void membarrier_execve(struct task_struct *t)
{
atomic_set(&t->mm->membarrier_state, 0);
}
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
struct mm_struct *next,
struct task_struct *tsk)
{
}
#endif
static inline void membarrier_execve(struct task_struct *t)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
#endif

#endif /* _LINUX_SCHED_MM_H */
21 changes: 21 additions & 0 deletions include/linux/sync_core.h
@@ -0,0 +1,21 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SYNC_CORE_H
#define _LINUX_SYNC_CORE_H

#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
#include <asm/sync_core.h>
#else
/*
* This is a dummy sync_core_before_usermode() implementation that can be used
* on all architectures which return to user-space through core serializing
* instructions.
* If your architecture returns to user-space through non-core-serializing
* instructions, you need to write your own functions.
*/
static inline void sync_core_before_usermode(void)
{
}
#endif

#endif /* _LINUX_SYNC_CORE_H */

74 changes: 66 additions & 8 deletions include/uapi/linux/membarrier.h
@@ -31,7 +31,7 @@
* enum membarrier_cmd - membarrier system call command
* @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns
* a bitmask of valid commands.
* @MEMBARRIER_CMD_SHARED: Execute a memory barrier on all running threads.
* @MEMBARRIER_CMD_GLOBAL: Execute a memory barrier on all running threads.
* Upon return from system call, the caller thread
* is ensured that all running threads have passed
* through a state where all memory accesses to
@@ -40,6 +40,28 @@
* (non-running threads are de facto in such a
* state). This covers threads from all processes
* running on the system. This command returns 0.
* @MEMBARRIER_CMD_GLOBAL_EXPEDITED:
* Execute a memory barrier on all running threads
* of all processes which previously registered
* with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
* Upon return from system call, the caller thread
* is ensured that all running threads have passed
* through a state where all memory accesses to
* user-space addresses match program order between
* entry to and return from the system call
* (non-running threads are de facto in such a
* state). This only covers threads from processes
* which registered with
* MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
* This command returns 0. Given that
* registration is about the intent to receive
* the barriers, it is valid to invoke
* MEMBARRIER_CMD_GLOBAL_EXPEDITED from a
* non-registered process.
* @MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
* Register the process intent to receive
* MEMBARRIER_CMD_GLOBAL_EXPEDITED memory
* barriers. Always returns 0.
* @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
* Execute a memory barrier on each running
* thread belonging to the same process as the current
@@ -51,7 +73,7 @@
* to and return from the system call
* (non-running threads are de facto in such a
* state). This only covers threads from the
* same processes as the caller thread. This
* same process as the caller thread. This
* command returns 0 on success. The
* "expedited" commands complete faster than
* the non-expedited ones, they never block,
@@ -64,18 +86,54 @@
* Register the process intent to use
* MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
* returns 0.
* @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
* In addition to providing the memory ordering
* guarantees described in
* MEMBARRIER_CMD_PRIVATE_EXPEDITED, ensure
* that, upon return from system call, all of
* the caller's running sibling threads have
* executed a core serializing
* instruction. (architectures are required to
* guarantee that non-running threads issue
* core serializing instructions before they
* resume user-space execution). This only
* covers threads from the same process as the
* caller thread. This command returns 0 on
* success. The "expedited" commands complete
* faster than the non-expedited ones, they
* never block, but have the downside of
* causing extra overhead. If this command is
* not implemented by an architecture, -EINVAL
* is returned. A process needs to register its
* intent to use the private expedited sync
* core command prior to using it, otherwise
* this command returns -EPERM.
* @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
* Register the process intent to use
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.
* If this command is not implemented by an
* architecture, -EINVAL is returned.
* Returns 0 on success.
* @MEMBARRIER_CMD_SHARED:
* Alias to MEMBARRIER_CMD_GLOBAL. Provided for
* header backward compatibility.
*
* Command to be passed to the membarrier system call. The commands need to
* be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
* the value 0.
*/
enum membarrier_cmd {
MEMBARRIER_CMD_QUERY = 0,
MEMBARRIER_CMD_SHARED = (1 << 0),
/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
MEMBARRIER_CMD_QUERY = 0,
MEMBARRIER_CMD_GLOBAL = (1 << 0),
MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1),
MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2),
MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),

/* Alias for header backward compatibility. */
MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
};

#endif /* _UAPI_LINUX_MEMBARRIER_H */
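To illustrate the registration protocol documented above (this example is not part of the commit), a minimal user-space sketch follows. It assumes a kernel with this series applied and calls membarrier(2) through syscall(2), since glibc provides no wrapper:

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* MEMBARRIER_CMD_QUERY returns a bitmask of supported commands. */
		long mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

		if (mask < 0 ||
		    !(mask & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE))
			return 1;

		/* Registration must precede use of the sync-core command. */
		if (syscall(__NR_membarrier,
			    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0))
			return 1;

		/*
		 * On success, all running sibling threads have executed a core
		 * serializing instruction; non-running threads will do so
		 * before resuming user-space execution.
		 */
		return syscall(__NR_membarrier,
			       MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0) ? 1 : 0;
	}

A MEMBARRIER_CMD_GLOBAL_EXPEDITED user follows the same pattern with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, although, as noted above, registration expresses the intent to receive the barriers, so issuing MEMBARRIER_CMD_GLOBAL_EXPEDITED from a non-registered process is also valid.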
9 changes: 9 additions & 0 deletions init/Kconfig
@@ -1412,6 +1412,12 @@ config USERFAULTFD
Enable the userfaultfd() system call that allows intercepting and
handling page faults in userland.

config ARCH_HAS_MEMBARRIER_CALLBACKS
bool

config ARCH_HAS_MEMBARRIER_SYNC_CORE
bool

config EMBEDDED
bool "Embedded system"
option allnoconfig_y
@@ -1915,3 +1921,6 @@ config ASN1
functions to call on what tags.

source "kernel/Kconfig.locks"

config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
bool