sched: Fix domain iteration

Weird topologies can lead to asymmetric domain setups. This needs
further consideration since these setups are typically non-minimal
too.

For now, make it work by adding an extra mask selecting which CPUs
are allowed to iterate up.

The topology that triggered it is the one from David Rientjes:

	10 20 20 30
	20 10 20 20
	20 20 10 20
	30 20 20 10

resulting in boxes that wouldn't even boot.

Reported-by: David Rientjes <rientjes@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Peter Zijlstra authored and Ingo Molnar committed Jun 6, 2012
1 parent 7f1b439 commit c117487
Showing 4 changed files with 72 additions and 10 deletions.
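As a rough illustration of why that distance table breaks the usual symmetry assumption, here is a standalone userspace sketch (not part of the patch; the 10/20/30 distance levels and the four-node layout come from the table above, everything else is made up). It prints, for every node, which nodes fall within each distance level:

#include <stdio.h>

static const int dist[4][4] = {
        { 10, 20, 20, 30 },
        { 20, 10, 20, 20 },
        { 20, 20, 10, 20 },
        { 30, 20, 20, 10 },
};

int main(void)
{
        static const int level[] = { 10, 20, 30 };
        int n, l, m;

        for (n = 0; n < 4; n++) {
                printf("node %d:", n);
                for (l = 0; l < 3; l++) {
                        printf("  <=%d:", level[l]);
                        for (m = 0; m < 4; m++)
                                if (dist[n][m] <= level[l])
                                        printf(" %d", m);
                }
                printf("\n");
        }
        return 0;
}

Nodes 1 and 2 already cover all four nodes at distance 20, while nodes 0 and 3 need the extra 30 level, so the per-CPU sched_domain trees end up with different depths. That unequal depth is what the iteration mask introduced below has to cope with.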
11 changes: 11 additions & 0 deletions include/linux/sched.h
@@ -876,6 +876,8 @@ struct sched_group_power {
         * Number of busy cpus in this group.
         */
        atomic_t nr_busy_cpus;
+
+       unsigned long cpumask[0]; /* iteration mask */
 };
 
 struct sched_group {
@@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
        return to_cpumask(sg->cpumask);
 }
 
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+       return to_cpumask(sg->sgp->cpumask);
+}
+
 /**
  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
  * @group: The group whose first cpu is to be returned.
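The new sched_group_mask() hands back storage that only exists because the allocation is over-sized; the __sdt_alloc() hunk further down adds the matching "+ cpumask_size()". A minimal userspace sketch of that pattern, assuming plain calloc() instead of kzalloc_node() and a single unsigned long standing in for the real cpumask (nothing below is kernel API):

#include <stdio.h>
#include <stdlib.h>

struct fake_group_power {
        int power;                /* stands in for the real fields */
        unsigned long cpumask[0]; /* GNU C zero-length array, as in the patch;
                                   * the storage lives past the struct proper */
};

int main(void)
{
        size_t mask_bytes = sizeof(unsigned long); /* "cpumask_size()" stand-in */
        struct fake_group_power *sgp = calloc(1, sizeof(*sgp) + mask_bytes);

        if (!sgp)
                return 1;

        sgp->cpumask[0] = 0x5; /* pretend CPUs 0 and 2 may iterate up */
        printf("iteration mask: %#lx\n", sgp->cpumask[0]);

        free(sgp);
        return 0;
}

Hanging the mask off sched_group_power rather than sched_group means all groups that share one sched_group_power also share one iteration mask, which is why the core.c change below only builds the mask on the first reference.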
64 changes: 56 additions & 8 deletions kernel/sched/core.c
@@ -5994,6 +5994,44 @@ struct sched_domain_topology_level {
        struct sd_data data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+       const struct cpumask *span = sched_domain_span(sd);
+       struct sd_data *sdd = sd->private;
+       struct sched_domain *sibling;
+       int i;
+
+       for_each_cpu(i, span) {
+               sibling = *per_cpu_ptr(sdd->sd, i);
+               if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+                       continue;
+
+               cpumask_set_cpu(i, sched_group_mask(sg));
+       }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+       return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
@@ -6012,15 +6050,19 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                if (cpumask_test_cpu(i, covered))
                        continue;
 
+               child = *per_cpu_ptr(sdd->sd, i);
+
+               /* See the comment near build_group_mask(). */
+               if (!cpumask_test_cpu(i, sched_domain_span(child)))
+                       continue;
+
                sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                GFP_KERNEL, cpu_to_node(cpu));
 
                if (!sg)
                        goto fail;
 
                sg_span = sched_group_cpus(sg);
-
-               child = *per_cpu_ptr(sdd->sd, i);
                if (child->child) {
                        child = child->child;
                        cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                cpumask_or(covered, covered, sg_span);
 
                sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-               atomic_inc(&sg->sgp->ref);
+               if (atomic_inc_return(&sg->sgp->ref) == 1)
+                       build_group_mask(sd, sg);
 
 
-               if (cpumask_first(sg_span) == cpu) {
-                       WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+               /*
+                * Make sure the first group of this domain contains the
+                * canonical balance cpu. Otherwise the sched_domain iteration
+                * breaks. See update_sg_lb_stats().
+                */
+               if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+                   group_balance_cpu(sg) == cpu)
                        groups = sg;
-               }
 
                if (!first)
                        first = sg;
@@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
                cpumask_clear(sched_group_cpus(sg));
                sg->sgp->power = 0;
+               cpumask_setall(sched_group_mask(sg));
 
                for_each_cpu(j, span) {
                        if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
                sg = sg->next;
        } while (sg != sd->groups);
 
-       if (cpu != group_first_cpu(sg))
+       if (cpu != group_balance_cpu(sg))
                return;
 
        update_group_power(sd, cpu);
@@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                *per_cpu_ptr(sdd->sg, j) = sg;
 
-               sgp = kzalloc_node(sizeof(struct sched_group_power),
+               sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
                        GFP_KERNEL, cpu_to_node(j));
                if (!sgp)
                        return -ENOMEM;
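For intuition, here is a toy userspace model of the two new helpers (unsigned long bit masks stand in for struct cpumask, and the per-CPU spans are invented; this is a sketch of the idea, not the kernel code):

#include <stdio.h>

#define NCPUS 4

/*
 * Per-CPU domain span at one level. CPU 3 plays the asymmetric case:
 * build_sched_domains() stopped early for it, so its span is empty and
 * in particular does not contain CPU 3 itself.
 */
static const unsigned long sibling_span[NCPUS] = { 0xf, 0xf, 0xf, 0x0 };

/* Same idea as build_group_mask(): keep only CPUs whose own domain covers them. */
static unsigned long toy_build_group_mask(unsigned long sd_span)
{
        unsigned long mask = 0;
        int i;

        for (i = 0; i < NCPUS; i++) {
                if (!(sd_span & (1UL << i)))
                        continue;
                if (sibling_span[i] & (1UL << i))
                        mask |= 1UL << i;
        }
        return mask;
}

/* Same idea as group_balance_cpu(): first CPU in both the span and the mask. */
static int toy_group_balance_cpu(unsigned long group_span, unsigned long mask)
{
        unsigned long both = group_span & mask;

        return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
        unsigned long span = 0xf; /* the group spans CPUs 0-3 */
        unsigned long mask = toy_build_group_mask(span);

        printf("iteration mask %#lx, balance cpu %d\n",
               mask, toy_group_balance_cpu(span, mask));
        return 0;
}

This prints an iteration mask of 0x7 and balance cpu 0: CPU 3 stays in the group's span but is never chosen as the CPU that iterates up from it.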
5 changes: 3 additions & 2 deletions kernel/sched/fair.c
@@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
        int i;
 
        if (local_group)
-               balance_cpu = group_first_cpu(group);
+               balance_cpu = group_balance_cpu(group);
 
        /* Tally up the load of all CPUs in the group */
        max_cpu_load = 0;
@@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
                /* Bias balancing toward cpus of our domain */
                if (local_group) {
-                       if (idle_cpu(i) && !first_idle_cpu) {
+                       if (idle_cpu(i) && !first_idle_cpu &&
+                           cpumask_test_cpu(i, sched_group_mask(group))) {
                                first_idle_cpu = 1;
                                balance_cpu = i;
                        }
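The fair.c side then has to respect that mask when it elects a balance cpu. A hedged sketch of the selection rule (userspace toy, idle states and masks made up): an idle CPU may only take over the balance-cpu role if it is also in the group's iteration mask.

#include <stdio.h>

#define NCPUS 4

int main(void)
{
        /* toy inputs: the group spans CPUs 0-3, CPU 3 is not in the iteration mask */
        unsigned long group_span = 0xf, group_mask = 0x7;
        int idle[NCPUS] = { 0, 0, 0, 1 }; /* only CPU 3 is idle (invented) */
        int balance_cpu = __builtin_ctzl(group_span & group_mask); /* group_balance_cpu() stand-in */
        int first_idle_cpu = 0, i;

        for (i = 0; i < NCPUS; i++) {
                if (!(group_span & (1UL << i)))
                        continue;
                /* mirrors the patched condition in update_sg_lb_stats() */
                if (idle[i] && !first_idle_cpu && (group_mask & (1UL << i))) {
                        first_idle_cpu = 1;
                        balance_cpu = i;
                }
        }

        /* CPU 3 is idle but masked out, so balance_cpu stays at 0 */
        printf("balance_cpu = %d\n", balance_cpu);
        return 0;
}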
2 changes: 2 additions & 0 deletions kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
 
+extern int group_balance_cpu(struct sched_group *sg);
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"
