---
yaml
---
r: 7422
b: refs/heads/master
c: 9bf2229
h: refs/heads/master
v: v3
Paul Jackson authored and Linus Torvalds committed Sep 7, 2005
1 parent 8643f6f commit a4b47d5
Showing 6 changed files with 102 additions and 21 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: f90b1d2f1aaaa40c6519a32e69615edc25bb97d5
refs/heads/master: 9bf2229f8817677127a60c177aefce1badd22d7b
12 changes: 12 additions & 0 deletions trunk/Documentation/cpusets.txt
@@ -60,6 +60,18 @@ all of the cpus in the system. This removes any overhead due to
load balancing code trying to pull tasks outside of the cpu exclusive
cpuset only to be prevented by the tasks' cpus_allowed mask.

A cpuset that is mem_exclusive restricts kernel allocations for
page, buffer and other data commonly shared by the kernel across
multiple users. All cpusets, whether mem_exclusive or not, restrict
allocations of memory for user space. This enables configuring a
system so that several independent jobs can share common kernel
data, such as file system pages, while isolating each job's user
allocation in its own cpuset. To do this, construct a large
mem_exclusive cpuset to hold all the jobs, and construct child,
non-mem_exclusive cpusets for each individual job. Only a small
amount of typical kernel memory, such as requests from interrupt
handlers, is allowed to be taken outside even a mem_exclusive cpuset.

User level code may create and destroy cpusets by name in the cpuset
virtual file system, manage the attributes and permissions of these
cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
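To make the sharing/isolation split described in the new Documentation/cpusets.txt paragraph concrete, here is a minimal user-space sketch of that configuration: one large mem_exclusive cpuset holding all the jobs, plus a non-mem_exclusive child per job. It assumes the cpuset virtual file system is mounted at /dev/cpuset and uses illustrative CPU and memory-node numbers; the paths, values and helper below are examples only, not part of this patch.

#include <stdio.h>
#include <sys/stat.h>

/* Write a short string into a cpuset control file. */
static void put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return;
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* Large mem_exclusive parent: kernel data shared by the jobs,
	 * such as file system pages, may live on any of nodes 0-3. */
	mkdir("/dev/cpuset/jobs", 0755);
	put("/dev/cpuset/jobs/cpus", "0-7");
	put("/dev/cpuset/jobs/mems", "0-3");
	put("/dev/cpuset/jobs/mem_exclusive", "1");

	/* Non-mem_exclusive child for one job: its user-space pages are
	 * confined to node 0, while its GFP_KERNEL allocations may still
	 * fall back to the parent's nodes 0-3. */
	mkdir("/dev/cpuset/jobs/job1", 0755);
	put("/dev/cpuset/jobs/job1/cpus", "0-1");
	put("/dev/cpuset/jobs/job1/mems", "0");
	return 0;
}

A job's tasks would then be attached by writing their PIDs, one at a time, into /dev/cpuset/jobs/job1/tasks, as described elsewhere in cpusets.txt.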
5 changes: 3 additions & 2 deletions trunk/include/linux/cpuset.h
@@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void);
void cpuset_update_current_mems_allowed(void);
void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
int cpuset_zone_allowed(struct zone *z);
extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
extern struct file_operations proc_cpuset_operations;
extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);

@@ -48,7 +48,8 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
return 1;
}

static inline int cpuset_zone_allowed(struct zone *z)
static inline int cpuset_zone_allowed(struct zone *z,
unsigned int __nocast gfp_mask)
{
return 1;
}
80 changes: 72 additions & 8 deletions trunk/kernel/cpuset.c
@@ -1611,17 +1611,81 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
return 0;
}

/*
* nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
* ancestor to the specified cpuset. Call while holding cpuset_sem.
* If no ancestor is mem_exclusive (an unusual configuration), then
* returns the root cpuset.
*/
static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
{
while (!is_mem_exclusive(cs) && cs->parent)
cs = cs->parent;
return cs;
}

/**
* cpuset_zone_allowed - is zone z allowed in current->mems_allowed
* @z: zone in question
* cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
* @z: is this zone on an allowed node?
* @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
*
* Is zone z allowed in current->mems_allowed, or is
* the CPU in interrupt context? (zone is always allowed in this case)
*/
int cpuset_zone_allowed(struct zone *z)
* If we're in interrupt, yes, we can always allocate. If zone
* z's node is in our task's mems_allowed, yes. If it's not a
* __GFP_HARDWALL request and this zone's node is in the nearest
* mem_exclusive cpuset ancestor to this task's cpuset, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current task's cpuset.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest mem_exclusive ancestor cpuset.
*
* Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
* routine only calls here with __GFP_HARDWALL bit _not_ set if
* it's a GFP_KERNEL allocation, and all nodes in the current task's
* mems_allowed came up empty on the first pass over the zonelist.
* So only GFP_KERNEL allocations, if all nodes in the cpuset are
* short of memory, might require taking the cpuset_sem semaphore.
*
* The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
* calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
* hardwall cpusets - no allocation on a node outside the cpuset is
* allowed (unless in interrupt, of course).
*
* The second loop doesn't even call here for GFP_ATOMIC requests
* (if the __alloc_pages() local variable 'wait' is set). That check
* and the checks below have the combined effect in the second loop of
* the __alloc_pages() routine that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
* GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
* GFP_USER - only nodes in the current task's mems_allowed ok.
**/

int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
{
return in_interrupt() ||
node_isset(z->zone_pgdat->node_id, current->mems_allowed);
int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
int allowed = 1; /* is allocation in zone z allowed? */

if (in_interrupt())
return 1;
node = z->zone_pgdat->node_id;
if (node_isset(node, current->mems_allowed))
return 1;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return 0;

/* Not hardwall and node outside mems_allowed: scan up cpusets */
down(&cpuset_sem);
cs = current->cpuset;
if (!cs)
goto done; /* current task exiting */
cs = nearest_exclusive_ancestor(cs);
allowed = node_isset(node, cs->mems_allowed);
done:
up(&cpuset_sem);
return allowed;
}

/*
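To see the ordering of checks documented above end to end, here is a small self-contained user-space simulation (an illustration only, not kernel code from this patch). The struct, helper names and flag value are stand-ins, and for simplicity a task's mems_allowed is taken to be the same as its cpuset's.

#include <stdbool.h>
#include <stdio.h>

#define SIM_GFP_HARDWALL 0x1u		/* stand-in for __GFP_HARDWALL */

struct sim_cpuset {
	bool mem_exclusive;
	unsigned long mems_allowed;	/* bitmask of allowed memory nodes */
	struct sim_cpuset *parent;
};

static bool node_in(unsigned long mask, int node)
{
	return (mask >> node) & 1;
}

/* Mirrors nearest_exclusive_ancestor(): walk up until a mem_exclusive
 * cpuset (or the root) is reached. */
static struct sim_cpuset *nearest_exclusive(struct sim_cpuset *cs)
{
	while (!cs->mem_exclusive && cs->parent)
		cs = cs->parent;
	return cs;
}

/* Same ordering of checks as cpuset_zone_allowed() above. */
static bool sim_zone_allowed(bool in_irq, int node, struct sim_cpuset *cs,
			     unsigned int gfp_mask)
{
	if (in_irq)
		return true;		/* interrupt: any node is ok */
	if (node_in(cs->mems_allowed, node))
		return true;		/* node is in mems_allowed */
	if (gfp_mask & SIM_GFP_HARDWALL)
		return false;		/* hardwall request: stop here */
	return node_in(nearest_exclusive(cs)->mems_allowed, node);
}

int main(void)
{
	struct sim_cpuset big = { true, 0xf, NULL };	/* nodes 0-3, mem_exclusive */
	struct sim_cpuset job = { false, 0x1, &big };	/* node 0 only */

	/* Node 2 lies outside the job's mems_allowed but inside the
	 * enclosing mem_exclusive cpuset. */
	printf("hardwall (GFP_USER-like):      %d\n",
	       sim_zone_allowed(false, 2, &job, SIM_GFP_HARDWALL));	/* prints 0 */
	printf("no hardwall (GFP_KERNEL-like): %d\n",
	       sim_zone_allowed(false, 2, &job, 0));			/* prints 1 */
	return 0;
}

Passing the caller's gfp_mask straight through, as the later __alloc_pages() loops below do, is what lets a GFP_KERNEL request overflow into the enclosing mem_exclusive cpuset, while the first loop's forced __GFP_HARDWALL keeps every request inside mems_allowed as long as memory there is still available.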
16 changes: 10 additions & 6 deletions trunk/mm/page_alloc.c
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
classzone_idx = zone_idx(zones[0]);

restart:
/* Go through the zonelist once, looking for a zone with enough free */
/*
* Go through the zonelist once, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
int do_reclaim = should_reclaim_zone(z, gfp_mask);

if (!cpuset_zone_allowed(z))
if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
continue;

/*
@@ -845,14 +848,15 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
*
* This is the last chance, in general, before the goto nopage.
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
classzone_idx, can_try_harder,
gfp_mask & __GFP_HIGH))
continue;

if (wait && !cpuset_zone_allowed(z))
if (wait && !cpuset_zone_allowed(z, gfp_mask))
continue;

page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
/* go through the zonelist yet again, ignoring mins */
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!cpuset_zone_allowed(z))
if (!cpuset_zone_allowed(z, gfp_mask))
continue;
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
@@ -903,7 +907,7 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
gfp_mask & __GFP_HIGH))
continue;

if (!cpuset_zone_allowed(z))
if (!cpuset_zone_allowed(z, gfp_mask))
continue;

page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
classzone_idx, 0, 0))
continue;

if (!cpuset_zone_allowed(z))
if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
continue;

page = buffered_rmqueue(z, order, gfp_mask);
8 changes: 4 additions & 4 deletions trunk/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
if (zone->present_pages == 0)
continue;

if (!cpuset_zone_allowed(zone))
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;

zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];

if (!cpuset_zone_allowed(zone))
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;

zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];

if (!cpuset_zone_allowed(zone))
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;

zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
if (!cpuset_zone_allowed(zone))
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
return;
if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
return;
