---
r: 83805
b: refs/heads/master
c: 956db3c
h: refs/heads/master
i:
  83803: 691ab23
v: v3
Cliff Wickman authored and Linus Torvalds committed Feb 7, 2008
1 parent 1811e2e commit e23350b
Showing 4 changed files with 146 additions and 46 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: 31a7df01fd0cd786f60873a921aecafac148c290
refs/heads/master: 956db3ca0606e78456786ef19fd4dc7a5151a6e1
1 change: 1 addition & 0 deletions trunk/include/linux/cgroup.h
@@ -318,6 +318,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont,
struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);

#else /* !CONFIG_CGROUPS */

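The hunk above exports cgroup_attach_task() by declaring it in linux/cgroup.h, so code outside kernel/cgroup.c (such as the cpuset hotplug path added below) can move a task directly. A minimal sketch of a caller, assuming kernel context; example_move_task() is hypothetical, and per the comment on cgroup_attach_task() the caller must hold cgroup_mutex, here taken via cgroup_lock():

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hypothetical caller, not part of this commit. */
static int example_move_task(struct cgroup *dest, struct task_struct *tsk)
{
	int ret;

	cgroup_lock();				/* takes cgroup_mutex */
	ret = cgroup_attach_task(dest, tsk);	/* may take task_lock(tsk) */
	cgroup_unlock();
	return ret;
}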
22 changes: 11 additions & 11 deletions trunk/kernel/cgroup.c
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* attach_task() can increment it again. Because a count of zero
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
* The task_lock() exception
*
* The need for this exception arises from the action of
* attach_task(), which overwrites one task's cgroup pointer with
* cgroup_attach_task(), which overwrites one task's cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
* in attach_task(), modifying a task's cgroup pointer we use
* in cgroup_attach_task(), modifying a task's cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a task's cgroup pointer by attach_task()
* update of a task's cgroup pointer by cgroup_attach_task()
*/

/**
@@ -1194,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
* Call holding cgroup_mutex. May take task_lock of
* the task 'pid' during call.
*/
static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
int retval = 0;
struct cgroup_subsys *ss;
@@ -1287,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
get_task_struct(tsk);
}

ret = attach_task(cgrp, tsk);
ret = cgroup_attach_task(cgrp, tsk);
put_task_struct(tsk);
return ret;
}
@@ -2514,7 +2514,7 @@ int __init cgroup_init(void)
* - Used for /proc/<pid>/cgroup.
* - No need to task_lock(tsk) on this tsk->cgroup reference, as it
* doesn't really matter if tsk->cgroup changes after we read it,
* and we take cgroup_mutex, keeping attach_task() from changing it
* and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
* anyway. No need to check that tsk->cgroup != NULL, thanks to
* the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
* cgroup to top_cgroup.
@@ -2625,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
* it was not made under the protection of RCU or cgroup_mutex, so
* might no longer be a valid cgroup pointer. attach_task() might
* might no longer be a valid cgroup pointer. cgroup_attach_task() might
* have already changed current->cgroups, allowing the previously
* referenced cgroup group to be removed and freed.
*
@@ -2704,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
* attach us to a different cgroup, decrementing the count on
* the first cgroup that we never incremented. But in this case,
* top_cgroup isn't going away, and either task has PF_EXITING set,
* which wards off any attach_task() attempts, or task is a failed
* fork, never visible to attach_task.
* which wards off any cgroup_attach_task() attempts, or task is a failed
* fork, never visible to cgroup_attach_task.
*
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2845,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
}

/* All seems fine. Finish by moving the task into the new cgroup */
ret = attach_task(child, tsk);
ret = cgroup_attach_task(child, tsk);
mutex_unlock(&cgroup_mutex);

out_release:
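The cpuset changes that follow drive cgroup_scan_tasks() through a cgroup_scanner embedded in a cpuset_hotplug_scanner. As a reading aid, here is a minimal sketch of that callback pattern using only the scanner fields this commit sets (cg, test_task, process_task, heap); the example_* names are hypothetical:

#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/sched.h>

/* Called by cgroup_scan_tasks() for each task in scan->cg. */
static void example_process_task(struct task_struct *tsk,
				 struct cgroup_scanner *scan)
{
	/* move, count, or otherwise act on tsk here */
}

static void example_scan_cgroup(struct cgroup *cgrp)
{
	struct cgroup_scanner scan;

	scan.cg = cgrp;
	scan.test_task = NULL;		/* NULL selects all tasks in the cgroup */
	scan.process_task = example_process_task;
	scan.heap = NULL;		/* as in the cpuset scanner below */
	if (cgroup_scan_tasks(&scan))
		printk(KERN_ERR "example_scan_cgroup: cgroup_scan_tasks failed\n");
}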
167 changes: 133 additions & 34 deletions trunk/kernel/cpuset.c
@@ -56,6 +56,8 @@
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/kfifo.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/*
* Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {

/* partition number for rebuild_sched_domains() */
int pn;

/* used for walking a cpuset hierarchy */
struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
return container_of(task_subsys_state(task, cpuset_subsys_id),
struct cpuset, css);
}

struct cpuset_hotplug_scanner {
struct cgroup_scanner scan;
struct cgroup *to;
};

/* bits in struct cpuset flags field */
typedef enum {
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
return 0;
}

/**
* cpuset_do_move_task - move a given task to another cpuset
* @tsk: pointer to task_struct the task to move
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
* Called by cgroup_scan_tasks() for each task in a cgroup.
* Return nonzero to stop the walk through the tasks.
*/
void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
{
struct cpuset_hotplug_scanner *chsp;

chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
cgroup_attach_task(chsp->to, tsk);
}

/**
* move_member_tasks_to_cpuset - move tasks from one cpuset to another
* @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved
*
* Called with manage_sem held
* callback_mutex must not be held, as attach_task() will take it.
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
struct cpuset_hotplug_scanner scan;

scan.scan.cg = from->css.cgroup;
scan.scan.test_task = NULL; /* select all tasks in cgroup */
scan.scan.process_task = cpuset_do_move_task;
scan.scan.heap = NULL;
scan.to = to->css.cgroup;

if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}

/*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then the guarantee_online_cpus()
* or guarantee_online_mems() code will use that emptied cpusets
* parent online CPUs or nodes. Cpusets that were already empty of
* CPUs or nodes are left empty.
*
* This routine is intentionally inefficient in a couple of regards.
* It will check all cpusets in a subtree even if the top cpuset of
* the subtree has no offline CPUs or nodes. It checks both CPUs and
* nodes, even though the caller could have been coded to know that
* only one of CPUs or nodes needed to be checked on a given call.
* This was done to minimize text size rather than cpu cycles.
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
*
* Call with both manage_mutex and callback_mutex held.
* The parent cpuset has some superset of the 'mems' nodes that the
* newly empty cpuset held, so no migration of memory is necessary.
*
* Recursive, on depth of cpuset subtree.
* Called with both manage_sem and callback_sem held
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;

/* the cgroup's css_sets list is in use if there are tasks
in the cpuset; the list is empty if there are none;
the cs->css.refcnt seems always 0 */
if (list_empty(&cs->css.cgroup->css_sets))
return;

static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
/*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
parent = cs->parent;
while (cpus_empty(parent->cpus_allowed)) {
/*
* this empty cpuset should now be considered to
* have been used, and therefore eligible for
* release when empty (if it is notify_on_release)
*/
parent = parent->parent;
}

move_member_tasks_to_cpuset(cs, parent);
}

/*
* Walk the specified cpuset subtree and look for empty cpusets.
* The tasks of such cpuset must be moved to a parent cpuset.
*
* Note that such a notify_on_release cpuset must have had, at some time,
* member tasks or cpuset descendants and cpus and memory, before it can
* be a candidate for release.
*
* Called with manage_mutex held. We take callback_mutex to modify
* cpus_allowed and mems_allowed.
*
* This walk processes the tree from top to bottom, completing one layer
* before dropping down to the next. It always processes a node before
* any of its children.
*
* For now, since we lack memory hot unplug, we'll never see a cpuset
* that has tasks along with an empty 'mems'. But if we did see such
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
*/
static void scan_for_empty_cpusets(const struct cpuset *root)
{
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct list_head queue;
struct cgroup *cont;
struct cpuset *c;

/* Each of our child cpusets mems must be online */
list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
c = cgroup_cs(cont);
guarantee_online_cpus_mems_in_subtree(c);
if (!cpus_empty(c->cpus_allowed))
guarantee_online_cpus(c, &c->cpus_allowed);
if (!nodes_empty(c->mems_allowed))
guarantee_online_mems(c, &c->mems_allowed);
INIT_LIST_HEAD(&queue);

list_add_tail((struct list_head *)&root->stack_list, &queue);

mutex_lock(&callback_mutex);
while (!list_empty(&queue)) {
cp = container_of(queue.next, struct cpuset, stack_list);
list_del(queue.next);
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
list_add_tail(&child->stack_list, &queue);
}
cont = cp->css.cgroup;
/* Remove offline cpus and mems from this cpuset. */
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
if ((cpus_empty(cp->cpus_allowed) ||
nodes_empty(cp->mems_allowed))) {
/* Move tasks from the empty cpuset to a parent */
mutex_unlock(&callback_mutex);
remove_tasks_in_empty_cpuset(cp);
mutex_lock(&callback_mutex);
}
}
mutex_unlock(&callback_mutex);
return;
}

/*
* The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
* cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
* track what's online after any CPU or memory node hotplug or unplug
* event.
*
* To ensure that we don't remove a CPU or node from the top cpuset
* that is currently in use by a child cpuset (which would violate
* the rule that cpusets must be subsets of their parent), we first
* call the recursive routine guarantee_online_cpus_mems_in_subtree().
* track what's online after any CPU or memory node hotplug or unplug event.
*
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
static void common_cpu_mem_hotplug_unplug(void)
{
cgroup_lock();
mutex_lock(&callback_mutex);

guarantee_online_cpus_mems_in_subtree(&top_cpuset);
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);

mutex_unlock(&callback_mutex);
cgroup_unlock();
}

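The new scan_for_empty_cpusets() above replaces the recursive guarantee_online_cpus_mems_in_subtree() with an iterative top-down walk: each cpuset now embeds a stack_list list_head, and a local queue guarantees a cpuset is processed before any of its children. A self-contained sketch of that traversal pattern, with a hypothetical example_node standing in for struct cpuset:

#include <linux/kernel.h>
#include <linux/list.h>

struct example_node {
	struct list_head children;	/* heads this node's list of children */
	struct list_head sibling;	/* linkage on the parent's children list */
	struct list_head stack_list;	/* scratch linkage used only by the walk */
};

static void example_walk(struct example_node *root,
			 void (*visit)(struct example_node *node))
{
	struct example_node *node, *child;
	struct list_head queue;

	INIT_LIST_HEAD(&queue);
	list_add_tail(&root->stack_list, &queue);
	while (!list_empty(&queue)) {
		node = container_of(queue.next, struct example_node, stack_list);
		list_del(queue.next);
		/* children go to the back of the queue, after this node */
		list_for_each_entry(child, &node->children, sibling)
			list_add_tail(&child->stack_list, &queue);
		visit(node);
	}
}

scan_for_empty_cpusets() follows this shape, additionally dropping callback_mutex around remove_tasks_in_empty_cpuset(), since (per the comment on move_member_tasks_to_cpuset()) the task-attach path takes that mutex itself.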
