diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index f2235a1625295..fdf7dff3f607d 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -392,8 +392,10 @@ Put simply, it costs less to balance between two smaller sched domains than one big one, but doing so means that overloads in one of the two domains won't be load balanced to the other one. -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. This default load balancing across all CPUs is not well suited for the following two situations: @@ -465,6 +467,10 @@ such partially load balanced cpusets, as they may be artificially constrained to some subset of the CPUs allowed to them, for lack of load balancing to the other CPUs. +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + 1.7.1 sched_load_balance implementation details. ------------------------------------------------ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3f3308824fa41..f74d4cc3a3e54 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -337,6 +337,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern cpumask_var_t cpu_isolated_map; + extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) diff --git a/init/main.c b/init/main.c index 6f0f1c5ff8cc8..4a6974e678396 100644 --- a/init/main.c +++ b/init/main.c @@ -654,8 +654,8 @@ asmlinkage __visible void __init start_kernel(void) page_writeback_init(); proc_root_init(); nsfs_init(); - cgroup_init(); cpuset_init(); + cgroup_init(); taskstats_init_early(); delayacct_init(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 29a7b2cc593e3..a220fdb66568e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) static void pidlist_free(void *p) { - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); + kvfree(p); } /* @@ -5040,6 +5037,9 @@ int __init cgroup_init(void) WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc7f4748d34a9..c68f0721df10e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.effective_cpus); + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); goto done; } @@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, * the corresponding sched domain. */ if (!cpumask_empty(cp->cpus_allowed) && - !is_sched_load_balance(cp)) + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) continue; if (is_sched_load_balance(cp)) @@ -748,6 +755,7 @@ static int generate_sched_domains(cpumask_var_t **domains, if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -760,6 +768,7 @@ static int generate_sched_domains(cpumask_var_t **domains, BUG_ON(nslot != ndoms); done: + free_cpumask_var(non_isolated_cpus); kfree(csa); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 261af7bfcb67d..2f7937ee9e3a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) {