Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:

 - Apply a number of membarrier-related fixes and cleanups, which fix
   a use-after-free race in the membarrier code (a userspace usage
   sketch of the syscall follows this list)

 - Introduce proper RCU protection for tasks on the runqueue - to get
   rid of the subtle task_rcu_dereference() interface that was easy to
   get wrong

 - Misc fixes, but also an EAS speedup
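
A userspace usage sketch of the membarrier(2) interface that the fixes
above concern (editorial, not part of the pull request; the membarrier()
wrapper below is just a local helper around the raw syscall):

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <stdio.h>
    #include <unistd.h>

    static int membarrier(int cmd, unsigned int flags)
    {
            return (int) syscall(__NR_membarrier, cmd, flags);
    }

    int main(void)
    {
            /* Register the process for private expedited membarriers;
             * the registration and state-tracking paths are what several
             * of the fixes below touch. */
            if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
                    perror("membarrier register");
                    return 1;
            }

            /* Issue an expedited barrier across all threads of this process. */
            if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
                    perror("membarrier");
                    return 1;
            }
            return 0;
    }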

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid redundant EAS calculation
  sched/core: Remove double update_max_interval() call on CPU startup
  sched/core: Fix preempt_schedule() interrupt return comment
  sched/fair: Fix -Wunused-but-set-variable warnings
  sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
  sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
  sched/membarrier: Skip IPIs when mm->mm_users == 1
  selftests, sched/membarrier: Add multi-threaded test
  sched/membarrier: Fix p->mm->membarrier_state racy load
  sched/membarrier: Call sync_core only before usermode for same mm
  sched/membarrier: Remove redundant check
  sched/membarrier: Fix private expedited registration check
  tasks, sched/core: RCUify the assignment of rq->curr
  tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
  tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
  tasks: Add a count of task RCU users
  sched/core: Convert vcpu_is_preempted() from macro to an inline function
  sched/fair: Remove unused cfs_rq_clock_task() function
Linus Torvalds committed Sep 28, 2019
2 parents aefcf2f + 4892f51 commit 9c5efe9
Showing 17 changed files with 375 additions and 250 deletions.
2 changes: 1 addition & 1 deletion fs/exec.c
@@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm)
}
task_lock(tsk);
active_mm = tsk->active_mm;
membarrier_exec_mmap(mm);
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
@@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
membarrier_execve(current);
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
14 changes: 11 additions & 3 deletions include/linux/mm_types.h
@@ -383,6 +383,16 @@ struct mm_struct {
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
/**
* @membarrier_state: Flags controlling membarrier behavior.
*
* This field is close to @pgd to hopefully fit in the same
* cache-line, which needs to be touched by switch_mm().
*/
atomic_t membarrier_state;
#endif

/**
* @mm_users: The number of users including userspace.
*
@@ -452,9 +462,7 @@ struct mm_struct {
unsigned long flags; /* Must use atomic bitops to access */

struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_MEMBARRIER
atomic_t membarrier_state;
#endif

#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
20 changes: 4 additions & 16 deletions include/linux/rcuwait.h
@@ -6,16 +6,11 @@

/*
* rcuwait provides a way of blocking and waking up a single
* task in an rcu-safe manner; where it is forbidden to use
* after exit_notify(). task_struct is not properly rcu protected,
* unless dealing with rcu-aware lists, ie: find_task_by_*().
* task in an rcu-safe manner.
*
* Alternatively we have task_rcu_dereference(), but the return
* semantics have different implications which would break the
* wakeup side. The only time @task is non-nil is when a user is
* blocked (or checking if it needs to) on a condition, and reset
* as soon as we know that the condition has succeeded and are
* awoken.
* The only time @task is non-nil is when a user is blocked (or
* checking if it needs to) on a condition, and reset as soon as we
* know that the condition has succeeded and are awoken.
*/
struct rcuwait {
struct task_struct __rcu *task;
@@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w);
*/
#define rcuwait_wait_event(w, condition) \
({ \
/* \
* Complain if we are called after do_exit()/exit_notify(), \
* as we cannot rely on the rcu critical region for the \
* wakeup side. \
*/ \
WARN_ON(current->exit_state); \
\
rcu_assign_pointer((w)->task, current); \
for (;;) { \
/* \
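
An editorial usage sketch of the rcuwait API documented above (the
example_* names are hypothetical). With task_struct now guaranteed to
stay around for an RCU grace period after the task leaves the runqueue,
the old "do not use after exit_notify()" restriction and its WARN_ON()
can go away:

    static struct rcuwait example_wait;
    static bool example_done;

    static void example_waiter(void)
    {
            /* Publishes current in example_wait.task, then sleeps until
             * the condition becomes true. */
            rcuwait_wait_event(&example_wait, READ_ONCE(example_done));
    }

    static void example_waker(void)
    {
            WRITE_ONCE(example_done, true);
            /* Wakes the single blocked task, if any, from within an RCU
             * read-side critical section. */
            rcuwait_wake_up(&example_wait);
    }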
10 changes: 8 additions & 2 deletions include/linux/sched.h
@@ -1130,7 +1130,10 @@ struct task_struct {

struct tlbflush_unmap_batch tlb_ubc;

struct rcu_head rcu;
union {
refcount_t rcu_users;
struct rcu_head rcu;
};

/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
@@ -1839,7 +1842,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
* running or not.
*/
#ifndef vcpu_is_preempted
# define vcpu_is_preempted(cpu) false
static inline bool vcpu_is_preempted(int cpu)
{
return false;
}
#endif

extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
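
An editorial note on the union added to task_struct above (not part of
the diff): the two members are never live at the same time, which is
what makes sharing the storage safe:

    union {
            /* Used while the task is still reachable by RCU readers. */
            refcount_t rcu_users;
            /* Handed to call_rcu() only after rcu_users has dropped to
             * zero, so it never overlaps a live rcu_users count. */
            struct rcu_head rcu;
    };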
10 changes: 5 additions & 5 deletions include/linux/sched/mm.h
@@ -362,16 +362,16 @@ enum {

static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
if (current->mm != mm)
return;
if (likely(!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
return;
sync_core_before_usermode();
}

static inline void membarrier_execve(struct task_struct *t)
{
atomic_set(&t->mm->membarrier_state, 0);
}
extern void membarrier_exec_mmap(struct mm_struct *mm);

#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -380,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
{
}
#endif
static inline void membarrier_execve(struct task_struct *t)
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
2 changes: 1 addition & 1 deletion include/linux/sched/task.h
@@ -119,7 +119,7 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}

struct task_struct *task_rcu_dereference(struct task_struct **ptask);
void put_task_struct_rcu_user(struct task_struct *task);

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
74 changes: 6 additions & 68 deletions kernel/exit.c
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}

void put_task_struct_rcu_user(struct task_struct *task)
{
if (refcount_dec_and_test(&task->rcu_users))
call_rcu(&task->rcu, delayed_put_task_struct);
}

void release_task(struct task_struct *p)
{
@@ -222,76 +227,13 @@ void release_task(struct task_struct *p)

write_unlock_irq(&tasklist_lock);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
put_task_struct_rcu_user(p);

p = leader;
if (unlikely(zap_leader))
goto repeat;
}

/*
* Note that if this function returns a valid task_struct pointer (!NULL)
* task->usage must remain >0 for the duration of the RCU critical section.
*/
struct task_struct *task_rcu_dereference(struct task_struct **ptask)
{
struct sighand_struct *sighand;
struct task_struct *task;

/*
* We need to verify that release_task() was not called and thus
* delayed_put_task_struct() can't run and drop the last reference
* before rcu_read_unlock(). We check task->sighand != NULL,
* but we can read the already freed and reused memory.
*/
retry:
task = rcu_dereference(*ptask);
if (!task)
return NULL;

probe_kernel_address(&task->sighand, sighand);

/*
* Pairs with atomic_dec_and_test() in put_task_struct(). If this task
* was already freed we can not miss the preceding update of this
* pointer.
*/
smp_rmb();
if (unlikely(task != READ_ONCE(*ptask)))
goto retry;

/*
* We've re-checked that "task == *ptask", now we have two different
* cases:
*
* 1. This is actually the same task/task_struct. In this case
* sighand != NULL tells us it is still alive.
*
* 2. This is another task which got the same memory for task_struct.
* We can't know this of course, and we can not trust
* sighand != NULL.
*
* In this case we actually return a random value, but this is
* correct.
*
* If we return NULL - we can pretend that we actually noticed that
* *ptask was updated when the previous task has exited. Or pretend
* that probe_slab_address(&sighand) reads NULL.
*
* If we return the new task (because sighand is not NULL for any
* reason) - this is fine too. This (new) task can't go away before
* another gp pass.
*
* And note: We could even eliminate the false positive if re-read
* task->sighand once again to avoid the falsely NULL. But this case
* is very unlikely so we don't care.
*/
if (!sighand)
return NULL;

return task;
}

void rcuwait_wake_up(struct rcuwait *w)
{
struct task_struct *task;
@@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w)
*/
smp_mb(); /* (B) */

/*
* Avoid using task_rcu_dereference() magic as long as we are careful,
* see comment in rcuwait_wait_event() regarding ->exit_state.
*/
task = rcu_dereference(w->task);
if (task)
wake_up_process(task);
8 changes: 5 additions & 3 deletions kernel/fork.c
@@ -915,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->cpus_ptr = &tsk->cpus_mask;

/*
* One for us, one for whoever does the "release_task()" (usually
* parent)
* One for the user space visible state that goes away when reaped.
* One for the scheduler.
*/
refcount_set(&tsk->usage, 2);
refcount_set(&tsk->rcu_users, 2);
/* One for the rcu users */
refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
28 changes: 15 additions & 13 deletions kernel/sched/core.c
@@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;

if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
@@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;

dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
@@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
/* Task is done with its stack. */
put_task_stack(prev);

put_task_struct(prev);
put_task_struct_rcu_user(prev);
}

tick_nohz_task_switch();
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
else
prev->active_mm = NULL;
} else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
* rq->curr and returning to userspace.
* rq->curr / membarrier_switch_mm() and returning to userspace.
*
* The below provides this either through switch_mm(), or in
* case 'prev->active_mm == next->mm' through
* finish_task_switch()'s mmdrop().
*/

switch_mm_irqs_off(prev->active_mm, next->mm, next);

if (!prev->mm) { // from kernel
@@ -4042,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt)

if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
/*
* RCU users of rcu_dereference(rq->curr) may not see
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -4223,9 +4227,8 @@ static void __sched notrace preempt_schedule_common(void)

#ifdef CONFIG_PREEMPTION
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
* This is the entry point to schedule() from in-kernel preemption
* off of preempt_enable.
*/
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
@@ -4296,7 +4299,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPTION */

/*
* this is the entry point to schedule() from kernel preemption
* This is the entry point to schedule() from kernel preemption
* off of irq context.
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
@@ -6069,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu)
__set_task_cpu(idle, cpu);
rcu_read_unlock();

rq->curr = rq->idle = idle;
rq->idle = idle;
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
idle->on_cpu = 1;
@@ -6430,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu)
}
rq_unlock_irqrestore(rq, &rf);

update_max_interval();

return 0;
}

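
For context, a minimal reader-side sketch (editorial, not part of the
diff): with rq->curr published through rcu_assign_pointer() /
RCU_INIT_POINTER() above, and task_struct freeing deferred by the
rcu_users count, a plain RCU read-side critical section suffices where
task_rcu_dereference() used to be needed. The helper name is
hypothetical:

    /* Hypothetical helper: report the pid of whatever runs on @rq. */
    static pid_t rq_curr_pid(struct rq *rq)
    {
            struct task_struct *p;
            pid_t pid;

            rcu_read_lock();
            p = rcu_dereference(rq->curr);
            /* p cannot be freed before rcu_read_unlock(). */
            pid = task_pid_nr(p);
            rcu_read_unlock();

            return pid;
    }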
(Diffs for the remaining 8 changed files are not shown here.)
