Skip to content

Commit

Permalink
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/l…
Browse files Browse the repository at this point in the history
…inux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The biggest change is a performance improvement on SMP systems:

  | 4 socket 40 core + SMT Westmere box, single 30 sec tbench
  | runs, higher is better:
  |
  | clients     1       2       4        8       16       32       64      128
  |..........................................................................
  | pre        30      41     118      645     3769     6214    12233    14312
  | post      299     603    1211     2418     4697     6847    11606    14557
  |
  | A nice increase in performance.

  which speedup is particularly noticeable on heavily interacting
  few-tasks workloads, so the changes should help desktop-style Xorg
  workloads and interactivity as well, on multi-core CPUs.

  There are also cpuset suspend behavior fixes/restructuring and various
  smaller tweaks."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix race in task_group()
  sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
  sched: Reset loop counters if all tasks are pinned and we need to redo load balance
  sched: Reorder 'struct lb_env' members to reduce its size
  sched: Improve scalability via 'CPU buddies', which withstand random perturbations
  cpusets: Remove/update outdated comments
  cpusets, hotplug: Restructure functions that are invoked during hotplug
  cpusets, hotplug: Implement cpuset tree traversal in a helper function
  CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
  sched/x86: Remove broken power estimation
  • Loading branch information
Linus Torvalds committed Jul 26, 2012
2 parents 44a6b84 + 8323f26 commit 7907163
Show file tree
Hide file tree
Showing 9 changed files with 291 additions and 146 deletions.
2 changes: 1 addition & 1 deletion arch/x86/kernel/cpu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)

obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o sched.o mshyperv.o
obj-y += vmware.o hypervisor.o mshyperv.o
obj-y += rdrand.o
obj-y += match.o

Expand Down
55 changes: 0 additions & 55 deletions arch/x86/kernel/cpu/sched.c

This file was deleted.

4 changes: 2 additions & 2 deletions include/linux/cpuset.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */

extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_update_active_cpus(bool cpu_online);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
Expand Down Expand Up @@ -124,7 +124,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}

static inline void cpuset_update_active_cpus(void)
static inline void cpuset_update_active_cpus(bool cpu_online)
{
partition_sched_domains(1, NULL, NULL);
}
Expand Down
12 changes: 11 additions & 1 deletion include/linux/init_task.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,17 @@ extern struct group_info init_groups;

extern struct cred init_cred;

extern struct task_group root_task_group;

#ifdef CONFIG_CGROUP_SCHED
# define INIT_CGROUP_SCHED(tsk) \
.sched_task_group = &root_task_group,
#else
# define INIT_CGROUP_SCHED(tsk)
#endif

#ifdef CONFIG_PERF_EVENTS
# define INIT_PERF_EVENTS(tsk) \
# define INIT_PERF_EVENTS(tsk) \
.perf_event_mutex = \
__MUTEX_INITIALIZER(tsk.perf_event_mutex), \
.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
Expand Down Expand Up @@ -161,6 +170,7 @@ extern struct cred init_cred;
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
INIT_CGROUP_SCHED(tsk) \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
Expand Down
6 changes: 5 additions & 1 deletion include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,7 @@ struct sched_domain {
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
int idle_buddy; /* cpu assigned to select_idle_sibling() */

/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
Expand Down Expand Up @@ -1244,6 +1245,9 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
Expand Down Expand Up @@ -2721,7 +2725,7 @@ extern int sched_group_set_rt_period(struct task_group *tg,
extern long sched_group_rt_period(struct task_group *tg);
extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
#endif
#endif
#endif /* CONFIG_CGROUP_SCHED */

extern int task_can_switch_user(struct user_struct *up,
struct task_struct *tsk);
Expand Down
130 changes: 92 additions & 38 deletions kernel/cpuset.c
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* the type of hotplug event */
enum hotplug_event {
CPUSET_CPU_OFFLINE,
CPUSET_MEM_OFFLINE,
};

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
Expand Down Expand Up @@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}

/*
* Walk the specified cpuset subtree and look for empty cpusets.
* The tasks of such cpuset must be moved to a parent cpuset.
* Helper function to traverse cpusets.
* It can be used to walk the cpuset tree from top to bottom, completing
* one layer before dropping down to the next (thus always processing a
* node before any of its children).
*/
static struct cpuset *cpuset_next(struct list_head *queue)
{
struct cpuset *cp;
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;

if (list_empty(queue))
return NULL;

cp = list_first_entry(queue, struct cpuset, stack_list);
list_del(queue->next);
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
list_add_tail(&child->stack_list, queue);
}

return cp;
}


/*
* Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
* online/offline) and update the cpusets accordingly.
* For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
* cpuset must be moved to a parent cpuset.
*
* Called with cgroup_mutex held. We take callback_mutex to modify
* cpus_allowed and mems_allowed.
Expand All @@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* before dropping down to the next. It always processes a node before
* any of its children.
*
* For now, since we lack memory hot unplug, we'll never see a cpuset
* that has tasks along with an empty 'mems'. But if we did see such
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
* In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
* if all present pages from a node are offlined.
*/
static void scan_for_empty_cpusets(struct cpuset *root)
static void
scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
{
LIST_HEAD(queue);
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
struct cpuset *cp; /* scans cpusets being updated */
static nodemask_t oldmems; /* protected by cgroup_mutex */

list_add_tail((struct list_head *)&root->stack_list, &queue);

while (!list_empty(&queue)) {
cp = list_first_entry(&queue, struct cpuset, stack_list);
list_del(queue.next);
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
list_add_tail(&child->stack_list, &queue);
switch (event) {
case CPUSET_CPU_OFFLINE:
while ((cp = cpuset_next(&queue)) != NULL) {

/* Continue past cpusets with all cpus online */
if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
continue;

/* Remove offline cpus from this cpuset. */
mutex_lock(&callback_mutex);
cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
cpu_active_mask);
mutex_unlock(&callback_mutex);

/* Move tasks from the empty cpuset to a parent */
if (cpumask_empty(cp->cpus_allowed))
remove_tasks_in_empty_cpuset(cp);
else
update_tasks_cpumask(cp, NULL);
}
break;

/* Continue past cpusets with all cpus, mems online */
if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
case CPUSET_MEM_OFFLINE:
while ((cp = cpuset_next(&queue)) != NULL) {

oldmems = cp->mems_allowed;
/* Continue past cpusets with all mems online */
if (nodes_subset(cp->mems_allowed,
node_states[N_HIGH_MEMORY]))
continue;

/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
cpu_active_mask);
nodes_and(cp->mems_allowed, cp->mems_allowed,
oldmems = cp->mems_allowed;

/* Remove offline mems from this cpuset. */
mutex_lock(&callback_mutex);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
mutex_unlock(&callback_mutex);
mutex_unlock(&callback_mutex);

/* Move tasks from the empty cpuset to a parent */
if (cpumask_empty(cp->cpus_allowed) ||
nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
update_tasks_nodemask(cp, &oldmems, NULL);
/* Move tasks from the empty cpuset to a parent */
if (nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
else
update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
Expand All @@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
* (of no affect) on systems that are actively using CPU hotplug
* but making no active use of cpusets.
*
* The only exception to this is suspend/resume, where we don't
* modify cpusets at all.
*
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_active_mask on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
*
* @cpu_online: Indicates whether this is a CPU online event (true) or
* a CPU offline event (false).
*/
void cpuset_update_active_cpus(void)
void cpuset_update_active_cpus(bool cpu_online)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
Expand All @@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
mutex_unlock(&callback_mutex);
scan_for_empty_cpusets(&top_cpuset);

if (!cpu_online)
scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);

ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();

Expand All @@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
* See also the previous routine cpuset_track_online_cpus().
* See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
Expand All @@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
case MEM_OFFLINE:
/*
* needn't update top_cpuset.mems_allowed explicitly because
* scan_for_empty_cpusets() will update it.
* scan_cpusets_upon_hotplug() will update it.
*/
scan_for_empty_cpusets(&top_cpuset);
scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
break;
default:
break;
Expand Down
Loading

0 comments on commit 7907163

Please sign in to comment.