From fcdb1eda5302599045bb366e679cccb4216f3873 Mon Sep 17 00:00:00 2001
From: Josh Don
Date: Wed, 15 Mar 2023 14:40:29 -0700
Subject: [PATCH 1/7] cgroup: fix display of forceidle time at root

We need to reset forceidle_sum to 0 when reading from root, since the
bstat we accumulate into is stack allocated.

To make this more robust, just replace the existing cputime reset with
a memset of the overall bstat.

Signed-off-by: Josh Don
Fixes: 1fcf54deb767 ("sched/core: add forced idle accounting for cgroups")
Cc: stable@vger.kernel.org # v6.0+
Signed-off-by: Tejun Heo
---
 kernel/cgroup/rstat.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 831f1f472bb81..0a2b4967e3334 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 	struct task_cputime *cputime = &bstat->cputime;
 	int i;

-	cputime->stime = 0;
-	cputime->utime = 0;
-	cputime->sum_exec_runtime = 0;
+	memset(bstat, 0, sizeof(*bstat));
 	for_each_possible_cpu(i) {
 		struct kernel_cpustat kcpustat;
 		u64 *cpustat = kcpustat.cpustat;
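The bug class is worth spelling out: the accumulator struct lives on the
caller's stack, and the old code zeroed only the three cputime fields, so
the later-added forceidle_sum started from whatever garbage the stack
held. A user-space C sketch of the same pattern (the struct and field
names are illustrative stand-ins, not the kernel's types):

#include <stdio.h>
#include <string.h>

/* Illustrative stand-ins for cgroup_base_stat/task_cputime. */
struct cputime {
	unsigned long long utime, stime, sum_exec_runtime;
};

struct base_stat {
	struct cputime cputime;
	unsigned long long forceidle_sum;	/* field the old reset missed */
};

static void accumulate(struct base_stat *bstat, int ncpu)
{
	/* memset() zeroes every field, including any added later. */
	memset(bstat, 0, sizeof(*bstat));

	for (int i = 0; i < ncpu; i++) {
		bstat->cputime.utime += 1;
		bstat->forceidle_sum += 2;	/* well-defined only after the memset */
	}
}

int main(void)
{
	struct base_stat bstat;	/* stack allocated, initially indeterminate */

	accumulate(&bstat, 4);
	printf("forceidle_sum=%llu\n", bstat.forceidle_sum);	/* prints 8 */
	return 0;
}

Zeroing the whole struct also keeps the reset in sync if more fields are
added to the stat structure later, which is the robustness argument made
in the message above.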
From 292fd843de26c551856e66faf134512c52dd78b4 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Fri, 17 Mar 2023 11:15:05 -0400
Subject: [PATCH 2/7] cgroup/cpuset: Fix partition root's cpuset.cpus update
 bug

It was found that commit 7a2127e66a00 ("cpuset: Call
set_cpus_allowed_ptr() with appropriate mask for task") introduced a
bug that corrupted "cpuset.cpus" of a partition root when it was
updated.

It is because the tmp->new_cpus field of the passed tmp parameter
of update_parent_subparts_cpumask() should not be used at all as
it contains important cpumask data that should not be overwritten.
Fix it by using tmp->addmask instead.

Also update update_cpumask() to make sure that trialcs->cpus_allowed
will not be corrupted until it is no longer needed.

Fixes: 7a2127e66a00 ("cpuset: Call set_cpus_allowed_ptr() with appropriate mask for task")
Signed-off-by: Waiman Long
Cc: stable@vger.kernel.org # v6.2+
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 636f1c682ac07..f310915d12575 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1513,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	spin_unlock_irq(&callback_lock);

 	if (adding || deleting)
-		update_tasks_cpumask(parent, tmp->new_cpus);
+		update_tasks_cpumask(parent, tmp->addmask);

 	/*
 	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1770,10 +1770,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	/*
 	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
 	 * to allocated cpumasks.
+	 *
+	 * Note that update_parent_subparts_cpumask() uses only addmask &
+	 * delmask, but not new_cpus.
 	 */
 	tmp.addmask  = trialcs->subparts_cpus;
 	tmp.delmask  = trialcs->effective_cpus;
-	tmp.new_cpus = trialcs->cpus_allowed;
+	tmp.new_cpus = NULL;
 #endif

 	retval = validate_change(cs, trialcs);
@@ -1838,6 +1841,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	}
 	spin_unlock_irq(&callback_lock);

+#ifdef CONFIG_CPUMASK_OFFSTACK
+	/* Now trialcs->cpus_allowed is available */
+	tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
 	/* effective_cpus will be updated here */
 	update_cpumasks_hier(cs, &tmp, false);

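The root cause is aliasing between a scratch mask and live data:
tmp.new_cpus pointed at trialcs->cpus_allowed, so any helper writing
through the scratch pointer corrupted the mask still being committed. A
stripped-down user-space sketch of the hazard (plain byte arrays instead
of cpumasks; all names illustrative):

#include <stdio.h>

#define NBITS 8

struct tmpmasks {
	unsigned char *addmask, *delmask, *new_cpus;
};

/* Stands in for a helper, like update_tasks_cpumask(), that treats the
 * buffer it is handed as scratch space and overwrites it. */
static void scribble(unsigned char *scratch)
{
	for (int i = 0; i < NBITS; i++)
		scratch[i] = 0;
}

int main(void)
{
	unsigned char cpus_allowed[NBITS] = {1, 1, 1, 1, 0, 0, 0, 0};
	unsigned char addmask[NBITS], delmask[NBITS];
	struct tmpmasks tmp = {
		.addmask  = addmask,
		.delmask  = delmask,
		.new_cpus = cpus_allowed,	/* the buggy aliasing pattern */
	};

	scribble(tmp.new_cpus);		/* silently clobbers cpus_allowed */
	printf("cpu0 allowed: %d\n", cpus_allowed[0]);	/* 0: corrupted */
	return 0;
}

The patch applies the usual two-part remedy: point the helper at a mask
that really is scratch (tmp->addmask), and delay aliasing new_cpus onto
trialcs->cpus_allowed until the latter's value is no longer needed.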
From 57dcd64c7e036299ef526b400a8d12b8a2352f26 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Wed, 5 Apr 2023 22:15:32 +0900
Subject: [PATCH 3/7] cgroup,freezer: hold cpu_hotplug_lock before
 freezer_mutex

syzbot is reporting a circular locking dependency between
cpu_hotplug_lock and freezer_mutex, for commit f5d39b020809
("freezer,sched: Rewrite core freezer logic") replaced atomic_inc()
in freezer_apply_state() with static_branch_inc() which holds
cpu_hotplug_lock.

cpu_hotplug_lock => cgroup_threadgroup_rwsem => freezer_mutex

  cgroup_file_write() {
    cgroup_procs_write() {
      __cgroup_procs_write() {
        cgroup_procs_write_start() {
          cgroup_attach_lock() {
            cpus_read_lock() {
              percpu_down_read(&cpu_hotplug_lock);
            }
            percpu_down_write(&cgroup_threadgroup_rwsem);
          }
        }
        cgroup_attach_task() {
          cgroup_migrate() {
            cgroup_migrate_execute() {
              freezer_attach() {
                mutex_lock(&freezer_mutex);
                (...snipped...)
              }
            }
          }
        }
        (...snipped...)
      }
    }
  }

freezer_mutex => cpu_hotplug_lock

  cgroup_file_write() {
    freezer_write() {
      freezer_change_state() {
        mutex_lock(&freezer_mutex);
        freezer_apply_state() {
          static_branch_inc(&freezer_active) {
            static_key_slow_inc() {
              cpus_read_lock();
              static_key_slow_inc_cpuslocked();
              cpus_read_unlock();
            }
          }
        }
        mutex_unlock(&freezer_mutex);
      }
    }
  }

Swap the locking order by moving cpus_read_lock() in
freezer_apply_state() to before mutex_lock(&freezer_mutex) in
freezer_change_state().

Reported-by: syzbot
Link: https://syzkaller.appspot.com/bug?extid=c39682e86c9d84152f93
Suggested-by: Hillf Danton
Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic")
Signed-off-by: Tetsuo Handa
Acked-by: Peter Zijlstra (Intel)
Reviewed-by: Mukesh Ojha
Signed-off-by: Tejun Heo
---
 kernel/cgroup/legacy_freezer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 1b6b21851e9d4..936473203a6b5 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
 #include <linux/mutex.h>
+#include <linux/cpu.h>

 /*
  * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
@@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,

 	if (freeze) {
 		if (!(freezer->state & CGROUP_FREEZING))
-			static_branch_inc(&freezer_active);
+			static_branch_inc_cpuslocked(&freezer_active);
 		freezer->state |= state;
 		freeze_cgroup(freezer);
 	} else {
@@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
 		if (!(freezer->state & CGROUP_FREEZING)) {
 			freezer->state &= ~CGROUP_FROZEN;
 			if (was_freezing)
-				static_branch_dec(&freezer_active);
+				static_branch_dec_cpuslocked(&freezer_active);
 			unfreeze_cgroup(freezer);
 		}
 	}
@@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
 {
 	struct cgroup_subsys_state *pos;

+	cpus_read_lock();
 	/*
 	 * Update all its descendants in pre-order traversal. Each
 	 * descendant will try to inherit its parent's FREEZING state as
@@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
 	}
 	rcu_read_unlock();
 	mutex_unlock(&freezer_mutex);
+	cpus_read_unlock();
 }

 static ssize_t freezer_write(struct kernfs_open_file *of,
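The fix is the standard cure for an ABBA deadlock: choose one global
order for the two locks and make every path follow it, here by hoisting
the hotplug read-lock above freezer_mutex so the write path matches the
attach path's cpu_hotplug_lock => ... => freezer_mutex order. A
miniature user-space analogue (POSIX primitives standing in for
cpu_hotplug_lock and freezer_mutex; names illustrative):

#include <pthread.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t freezer_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Analogue of static_branch_inc_cpuslocked(): the caller must already
 * hold hotplug_lock, so nothing is acquired here. */
static void branch_inc_cpuslocked(int *key)
{
	(*key)++;
}

/* Analogue of the fixed freezer_change_state(): hotplug_lock is taken
 * first, then freezer_mutex, never the reverse. */
static void change_state(int *key)
{
	pthread_rwlock_rdlock(&hotplug_lock);
	pthread_mutex_lock(&freezer_mutex);
	branch_inc_cpuslocked(key);
	pthread_mutex_unlock(&freezer_mutex);
	pthread_rwlock_unlock(&hotplug_lock);
}

int main(void)
{
	int freezer_active = 0;

	change_state(&freezer_active);
	return freezer_active == 1 ? 0 : 1;
}

The _cpuslocked variants exist precisely for this kind of refactoring:
they expect the hotplug lock to be held already instead of taking it
again.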
From ba9182a89626d5f83c2ee4594f55cb9c1e60f0e2 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 11 Apr 2023 09:35:57 -0400
Subject: [PATCH 4/7] cgroup/cpuset: Wake up cpuset_attach_wq tasks in
 cpuset_cancel_attach()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After a successful cpuset_can_attach() call which increments the
attach_in_progress flag, either cpuset_cancel_attach() or
cpuset_attach() will be called later. In cpuset_attach(), tasks in
cpuset_attach_wq, if present, will be woken up at the end. That is
not the case in cpuset_cancel_attach(). So a missed wakeup is possible
if the attach operation is somehow cancelled. Fix that by doing the
wakeup in cpuset_cancel_attach() as well.

Fixes: e44193d39e8d ("cpuset: let hotplug propagation work wait for task attaching")
Signed-off-by: Waiman Long
Reviewed-by: Michal Koutný
Cc: stable@vger.kernel.org # v3.11+
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f310915d12575..ff7eb8e2efdc0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2502,11 +2502,15 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
+	struct cpuset *cs;

 	cgroup_taskset_first(tset, &css);
+	cs = css_cs(css);

 	percpu_down_write(&cpuset_rwsem);
-	css_cs(css)->attach_in_progress--;
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
 	percpu_up_write(&cpuset_rwsem);
 }

From 42a11bf5c5436e91b040aeb04063be1710bb9f9c Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 11 Apr 2023 09:35:58 -0400
Subject: [PATCH 5/7] cgroup/cpuset: Make cpuset_fork() handle
 CLONE_INTO_CGROUP properly

By default, the clone(2) syscall spawns a child process into the same
cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
into cgroups"), the child will be spawned into a different cgroup which
is somewhat similar to writing the child's tid into "cgroup.threads".

The current cpuset_fork() method does not properly handle the
CLONE_INTO_CGROUP case where the cpuset of the child may be different
from that of its parent. Update the cpuset_fork() method to treat the
CLONE_INTO_CGROUP case similarly to cpuset_attach().

Since the newly cloned task has not been running yet, its actual
memory usage isn't known. So it is not necessary to make changes to mm
in cpuset_fork().

Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
Reported-by: Giuseppe Scrivano
Signed-off-by: Waiman Long
Cc: stable@vger.kernel.org # v5.7+
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 62 ++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ff7eb8e2efdc0..2ccfae74acf9a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2515,16 +2515,33 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 }

 /*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
  * but we can't allocate it dynamically there. Define it global and
  * allocate from cpuset_init().
  */
 static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+	percpu_rwsem_assert_held(&cpuset_rwsem);
+
+	if (cs != &top_cpuset)
+		guarantee_online_cpus(task, cpus_attach);
+	else
+		cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
+	/*
+	 * can_attach beforehand should guarantee that this doesn't
+	 * fail.  TODO: have a better way to handle failure here
+	 */
+	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+	cpuset_update_task_spread_flags(cs, task);
+}

 static void cpuset_attach(struct cgroup_taskset *tset)
 {
-	/* static buf protected by cpuset_rwsem */
-	static nodemask_t cpuset_attach_nodemask_to;
 	struct task_struct *task;
 	struct task_struct *leader;
 	struct cgroup_subsys_state *css;
@@ -2555,20 +2572,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)

 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

-	cgroup_taskset_for_each(task, css, tset) {
-		if (cs != &top_cpuset)
-			guarantee_online_cpus(task, cpus_attach);
-		else
-			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
-		/*
-		 * can_attach beforehand should guarantee that this doesn't
-		 * fail.  TODO: have a better way to handle failure here
-		 */
-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
-		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
-		cpuset_update_task_spread_flags(cs, task);
-	}
+	cgroup_taskset_for_each(task, css, tset)
+		cpuset_attach_task(cs, task);

 	/*
 	 * Change mm for all threadgroup leaders. This is expensive and may
@@ -3266,11 +3271,28 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
  */
 static void cpuset_fork(struct task_struct *task)
 {
-	if (task_css_is_root(task, cpuset_cgrp_id))
+	struct cpuset *cs;
+	bool same_cs;
+
+	rcu_read_lock();
+	cs = task_cs(task);
+	same_cs = (cs == task_cs(current));
+	rcu_read_unlock();
+
+	if (same_cs) {
+		if (cs == &top_cpuset)
+			return;
+
+		set_cpus_allowed_ptr(task, current->cpus_ptr);
+		task->mems_allowed = current->mems_allowed;
 		return;
+	}

-	set_cpus_allowed_ptr(task, current->cpus_ptr);
-	task->mems_allowed = current->mems_allowed;
+	/* CLONE_INTO_CGROUP */
+	percpu_down_write(&cpuset_rwsem);
+	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+	cpuset_attach_task(cs, task);
+	percpu_up_write(&cpuset_rwsem);
 }

 struct cgroup_subsys cpuset_cgrp_subsys = {
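For context on what this path serves: the user-visible trigger is
clone3(2) with CLONE_INTO_CGROUP, which passes an open cgroup directory
fd so the child starts life in that cgroup instead of its parent's. A
minimal sketch of such a caller (cgroup v2 only; the mount path and
cgroup name are examples, and error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */

int main(void)
{
	/* Example target; the cgroup must already exist. */
	int cgfd = open("/sys/fs/cgroup/mycpuset", O_RDONLY | O_DIRECTORY);
	if (cgfd < 0)
		return 1;

	struct clone_args args;
	memset(&args, 0, sizeof(args));
	args.flags = CLONE_INTO_CGROUP;
	args.cgroup = cgfd;
	args.exit_signal = SIGCHLD;

	/* glibc has no clone3() wrapper; call the syscall directly. */
	long pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid == 0)
		_exit(0);	/* child: born directly into mycpuset */
	return pid > 0 ? 0 : 1;
}

It is this child, created in a cpuset it was never attached to, that
cpuset_fork() previously left with its parent's cpumask and nodemask.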
From eee87853794187f6adbe19533ed79c8b44b36a91 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 11 Apr 2023 09:35:59 -0400
Subject: [PATCH 6/7] cgroup/cpuset: Add cpuset_can_fork() and
 cpuset_cancel_fork() methods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the case of CLONE_INTO_CGROUP, not all cpusets are ready to accept
new tasks. It is too late to check that in cpuset_fork(). So we need
to add the cpuset_can_fork() and cpuset_cancel_fork() methods to
pre-check it before we can allow attachment to a different cpuset.

We also need to set the attach_in_progress flag to alert other code
that a new task is going to be added to the cpuset.

Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
Suggested-by: Michal Koutný
Signed-off-by: Waiman Long
Cc: stable@vger.kernel.org # v5.7+
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 97 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2ccfae74acf9a..166a45019f665 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2453,6 +2453,20 @@ static int fmeter_getrate(struct fmeter *fmp)

 static struct cpuset *cpuset_attach_old_cs;

+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+	if (cpumask_empty(cs->effective_cpus) ||
+	   (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+		return -ENOSPC;
+	return 0;
+}
+
 /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
@@ -2467,16 +2481,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)

 	percpu_down_write(&cpuset_rwsem);

-	/* allow moving tasks into an empty cpuset if on default hierarchy */
-	ret = -ENOSPC;
-	if (!is_in_v2_mode() &&
-	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
-		goto out_unlock;
-
-	/*
-	 * Task cannot be moved to a cpuset with empty effective cpus.
-	 */
-	if (cpumask_empty(cs->effective_cpus))
+	/* Check to see if task is allowed in the cpuset */
+	ret = cpuset_can_attach_check(cs);
+	if (ret)
 		goto out_unlock;

 	cgroup_taskset_for_each(task, css, tset) {
@@ -2493,7 +2500,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	 * changes which zero cpus/mems_allowed.
 	 */
 	cs->attach_in_progress++;
-	ret = 0;
 out_unlock:
 	percpu_up_write(&cpuset_rwsem);
 	return ret;
@@ -3264,6 +3270,68 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	percpu_up_write(&cpuset_rwsem);
 }

+/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+	bool same_cs;
+	int ret;
+
+	rcu_read_lock();
+	same_cs = (cs == task_cs(current));
+	rcu_read_unlock();
+
+	if (same_cs)
+		return 0;
+
+	lockdep_assert_held(&cgroup_mutex);
+	percpu_down_write(&cpuset_rwsem);
+
+	/* Check to see if task is allowed in the cpuset */
+	ret = cpuset_can_attach_check(cs);
+	if (ret)
+		goto out_unlock;
+
+	ret = task_can_attach(task, cs->effective_cpus);
+	if (ret)
+		goto out_unlock;
+
+	ret = security_task_setscheduler(task);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Mark attach is in progress.  This makes validate_change() fail
+	 * changes which zero cpus/mems_allowed.
+	 */
+	cs->attach_in_progress++;
+out_unlock:
+	percpu_up_write(&cpuset_rwsem);
+	return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+	bool same_cs;
+
+	rcu_read_lock();
+	same_cs = (cs == task_cs(current));
+	rcu_read_unlock();
+
+	if (same_cs)
+		return;
+
+	percpu_down_write(&cpuset_rwsem);
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
+	percpu_up_write(&cpuset_rwsem);
+}
+
 /*
  * Make sure the new task conform to the current state of its parent,
  * which could have been changed by cpuset just after it inherits the
@@ -3292,6 +3360,11 @@ static void cpuset_fork(struct task_struct *task)
 	percpu_down_write(&cpuset_rwsem);
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 	cpuset_attach_task(cs, task);
+
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
+
 	percpu_up_write(&cpuset_rwsem);
 }

@@ -3305,6 +3378,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.attach		= cpuset_attach,
 	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
+	.can_fork	= cpuset_can_fork,
+	.cancel_fork	= cpuset_cancel_fork,
 	.fork		= cpuset_fork,
 	.legacy_cftypes	= legacy_files,
 	.dfl_cftypes	= dfl_files,
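The attach_in_progress handshake these new hooks join is a plain
gated-refcount pattern: each prospective attach increments the count
under the lock, every completion or cancellation path decrements it, and
waiters such as hotplug propagation sleep until it drains to zero. A
compact user-space analogue with a condition variable in place of
cpuset_attach_wq (names illustrative):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t attach_wq = PTHREAD_COND_INITIALIZER;
static int attach_in_progress;

/* can_attach()/can_fork(): publish that an attach is pending. */
static void attach_begin(void)
{
	pthread_mutex_lock(&lock);
	attach_in_progress++;
	pthread_mutex_unlock(&lock);
}

/* attach()/fork()/cancel_attach()/cancel_fork(): every exit path must
 * decrement, and the last one out must wake the waiters. The missed
 * wakeup fixed in patch 4 was a decrement path without the wake. */
static void attach_end(void)
{
	pthread_mutex_lock(&lock);
	if (--attach_in_progress == 0)
		pthread_cond_broadcast(&attach_wq);
	pthread_mutex_unlock(&lock);
}

/* Hotplug-propagation side: wait until no attach is in flight. */
static void wait_for_attach_drain(void)
{
	pthread_mutex_lock(&lock);
	while (attach_in_progress)
		pthread_cond_wait(&attach_wq, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	attach_begin();
	attach_end();
	wait_for_attach_drain();
	return 0;
}

This is also why cpuset_fork() above gains a decrement-and-wake: once
cpuset_can_fork() counts the pending attach, the fork path becomes one
of the exit paths that must balance it.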
From 7e27cb6ad4d85fc8bac2a2a896da62ef66b8598e Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 11 Apr 2023 09:36:00 -0400
Subject: [PATCH 7/7] cgroup/cpuset: Make cpuset_attach_task() skip
 subpartitions CPUs for top_cpuset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is found that attaching a task to the top_cpuset does not currently
ignore CPUs allocated to subpartitions in cpuset_attach_task(). So the
code is changed to fix that.

Signed-off-by: Waiman Long
Reviewed-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 166a45019f665..505d86b166426 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2535,7 +2535,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 	if (cs != &top_cpuset)
 		guarantee_online_cpus(task, cpus_attach);
 	else
-		cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
+		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+			       cs->subparts_cpus);
 	/*
 	 * can_attach beforehand should guarantee that this doesn't
 	 * fail.  TODO: have a better way to handle failure here
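The one-line change turns a copy into a set difference: a task attached
to the root may use any possible CPU except those granted to
subpartitions. A quick model of what cpumask_andnot() computes, on a
plain bitmask (8 CPUs and the mask values are made up for illustration):

#include <stdio.h>

int main(void)
{
	/* Bit i set => CPU i usable; 8 CPUs for illustration. */
	unsigned int possible = 0xff;	/* CPUs 0-7 */
	unsigned int subparts = 0x30;	/* CPUs 4-5 claimed by subpartitions */

	/* cpumask_andnot(dst, a, b) computes dst = a & ~b. */
	unsigned int cpus_attach = possible & ~subparts;

	printf("cpus_attach = 0x%02x\n", cpus_attach);	/* 0xcf: CPUs 0-3, 6-7 */
	return 0;
}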