
Commit

---
yaml
---
r: 42644
b: refs/heads/master
c: 9276b1b
h: refs/heads/master
v: v3
Paul Jackson authored and Linus Torvalds committed Dec 7, 2006
1 parent 9763ead commit d862135
Showing 5 changed files with 266 additions and 13 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: 89689ae7f95995723fbcd5c116c47933a3bb8b13
refs/heads/master: 9276b1bc96a132f4068fdee00983c532f43d3a26
2 changes: 2 additions & 0 deletions trunk/include/linux/cpuset.h
@@ -23,6 +23,7 @@ extern void cpuset_fork(struct task_struct *p);
extern void cpuset_exit(struct task_struct *p);
extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
void cpuset_update_task_memory_state(void);
#define cpuset_nodes_subset_current_mems_allowed(nodes) \
@@ -83,6 +84,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
return node_possible_map;
}

#define cpuset_current_mems_allowed (node_online_map)
static inline void cpuset_init_current_mems_allowed(void) {}
static inline void cpuset_update_task_memory_state(void) {}
#define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
85 changes: 80 additions & 5 deletions trunk/include/linux/mmzone.h
@@ -288,19 +288,94 @@ struct zone {
*/
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

#ifdef CONFIG_NUMA
/*
* We cache key information from each zonelist for smaller cache
* footprint when scanning for free pages in get_page_from_freelist().
*
* 1) The BITMAP fullzones tracks which zones in a zonelist have come
* up short of free memory since the last time (last_full_zap)
* we zeroed fullzones.
* 2) The array z_to_n[] maps each zone in the zonelist to its node
* id, so that we can efficiently evaluate whether that node is
* set in the current task's mems_allowed.
*
* Both fullzones and z_to_n[] are one-to-one with the zonelist,
* indexed by a zone's offset in the zonelist zones[] array.
*
* The get_page_from_freelist() routine does two scans. During the
* first scan, we skip zones whose corresponding bit in 'fullzones'
* is set or whose corresponding node in current->mems_allowed (which
* comes from cpusets) is not set. During the second scan, we bypass
* this zonelist_cache, to ensure we look methodically at each zone.
*
* Once per second, we zero out (zap) fullzones, forcing us to
* reconsider nodes that might have regained more free memory.
* The field last_full_zap is the time we last zapped fullzones.
*
* This mechanism reduces the amount of time we waste repeatedly
* re-examining zones for free memory when they came up low on
* memory only a moment ago.
*
* The zonelist_cache struct members logically belong in struct
* zonelist. However, the mempolicy zonelists constructed for
* MPOL_BIND are intentionally variable length (and usually much
* shorter). A general purpose mechanism for handling structs with
* multiple variable length members is more mechanism than we want
* here. We resort to some special case hackery instead.
*
* The MPOL_BIND zonelists don't need this zonelist_cache (in good
* part because they are shorter), so we put the fixed length stuff
* at the front of the zonelist struct, ending in a variable length
* zones[], as is needed by MPOL_BIND.
*
* Then we put the optional zonelist cache on the end of the zonelist
* struct. This optional stuff is found by a 'zlcache_ptr' pointer in
* the fixed length portion at the front of the struct. This pointer
* both lets us find the zonelist cache and, in the case of MPOL_BIND
* zonelists (which just set zlcache_ptr to NULL), tells us that the
* zonelist cache is not there.
*
* The end result is that struct zonelists come in two flavors:
* 1) The full, fixed length version, shown below, and
* 2) The custom zonelists for MPOL_BIND.
* The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
*
* Even though there may be multiple CPU cores on a node modifying
* fullzones or last_full_zap in the same zonelist_cache at the same
* time, we don't lock it. This is just hint data - if it is wrong now
* and then, the allocator will still function, perhaps a bit slower.
*/


struct zonelist_cache {
DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
#else
struct zonelist_cache;
#endif

/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones; the first one is the 'goal' of the
* allocation, and the other zones are fallback zones, in decreasing
* priority.
*
* Right now a zonelist takes up less than a cacheline. We never
* modify it apart from boot-up, and only a few indices are used,
* so despite the zonelist table being relatively big, the cache
* footprint of this construct is very small.
* If zlcache_ptr is not NULL, then it is just the address of zlcache,
* as explained above. If zlcache_ptr is NULL, there is no zlcache.
*/

struct zonelist {
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited
#ifdef CONFIG_NUMA
struct zonelist_cache zlcache; // optional ...
#endif
};

#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
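The comment block added to include/linux/mmzone.h above describes three pieces of per-zonelist state: the fullzones bitmap, the z_to_n[] zone-to-node map, and the once-per-second zap of fullzones. Below is a minimal userspace sketch of that bookkeeping; all toy_* names, sizes, and values are illustrative and not part of this patch (the real logic is in the mm/page_alloc.c hunks further down), and seconds from time() stand in for jiffies.

#include <stdio.h>
#include <time.h>

#define TOY_ZONES 8                        /* stand-in for MAX_ZONES_PER_ZONELIST */

struct toy_zlc {
        unsigned long fullzones;           /* bit i set => zone i looked full recently */
        unsigned short z_to_n[TOY_ZONES];  /* zone index -> node id */
        time_t last_full_zap;              /* seconds instead of jiffies */
};

/* Same test as zlc_zone_worth_trying(): node allowed and zone not marked full. */
static int toy_zone_worth_trying(const struct toy_zlc *zlc, int i,
                                 unsigned long allowed_node_mask)
{
        int full    = (zlc->fullzones >> i) & 1;
        int allowed = (allowed_node_mask >> zlc->z_to_n[i]) & 1;

        return allowed && !full;
}

/* Once per second, forget which zones were full (the "zap" in the comment). */
static void toy_maybe_zap(struct toy_zlc *zlc)
{
        time_t now = time(NULL);

        if (now - zlc->last_full_zap > 1) {
                zlc->fullzones = 0;
                zlc->last_full_zap = now;
        }
}

int main(void)
{
        struct toy_zlc zlc = { .z_to_n = { 0, 0, 1, 1, 2, 2, 3, 3 } };

        toy_maybe_zap(&zlc);               /* stale stamp, so fullzones is cleared */
        zlc.fullzones |= 1UL << 2;         /* pretend zone 2 came up short of memory */
        printf("zone 2: %d\n", toy_zone_worth_trying(&zlc, 2, 0xf));  /* 0: cached full */
        printf("zone 3: %d\n", toy_zone_worth_trying(&zlc, 3, 0xf));  /* 1: worth trying */
        return 0;
}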
2 changes: 2 additions & 0 deletions trunk/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
enum zone_type k;

max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
max++; /* space for zlcache_ptr (see mmzone.h) */
zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
zl->zlcache_ptr = NULL;
num = 0;
/* First put in the highest zones from all nodes, then all the next
lower zones etc. Avoid empty zones because the memory allocator
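The max++ added to bind_zonelist() above reserves one extra pointer-sized slot because struct zonelist now begins with zlcache_ptr, ahead of the variable-length zones[] array; setting it to NULL tells readers that an MPOL_BIND zonelist carries no zonelist cache. A hedged sketch of that sizing with made-up constants (toy_* names are ours, not the kernel's): with a node mask of weight 2 and 4 zones per node, max starts at 1 + 4 * 2 = 9 slots (8 zone pointers plus the NULL terminator) and becomes 10 after the increment.

#include <stdlib.h>

#define TOY_MAX_NR_ZONES 4                 /* illustrative value, not the kernel's */

struct toy_zonelist {
        void *zlcache_ptr;                 /* NULL for MPOL_BIND-style lists: no cache */
        void *zones[];                     /* NULL-terminated, variable length */
};

static struct toy_zonelist *toy_bind_zonelist(int nodes_in_mask)
{
        /* One pointer per possible (node, zone) pair, plus the NULL terminator... */
        int max = 1 + TOY_MAX_NR_ZONES * nodes_in_mask;
        max++;                             /* ...plus one slot for the leading zlcache_ptr */

        struct toy_zonelist *zl = malloc(sizeof(void *) * max);

        if (!zl)
                return NULL;
        zl->zlcache_ptr = NULL;            /* readers see: no zonelist cache behind this */
        zl->zones[0] = NULL;               /* caller fills zones[] and re-terminates it */
        return zl;
}

Like the kernel's kmalloc(sizeof(struct zone *) * max), this sizing leans on the zlcache_ptr member being exactly one pointer wide.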
188 changes: 181 additions & 7 deletions trunk/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}

#ifdef CONFIG_NUMA
/*
* zlc_setup - Setup for "zonelist cache". Uses cached zone data to
* skip over zones that are not allowed by the cpuset, or that have
* been recently (in the last second) found to be nearly full. See further
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
* If the zonelist cache is present in the passed-in zonelist, then
* returns a pointer to the allowed node mask (either the current
* task's mems_allowed, or node_online_map).
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
*
* If the fullzones BITMAP in the zonelist cache is stale (more than
* a second since it was last zapped), then we zap it out (clear its bits).
*
* We hold off even calling zlc_setup, until after we've checked the
* first zone in the zonelist, on the theory that most allocations will
* be satisfied from that first zone, so best to examine that zone as
* quickly as we can.
*/
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
nodemask_t *allowednodes; /* zonelist_cache approximation */

zlc = zonelist->zlcache_ptr;
if (!zlc)
return NULL;

if (jiffies - zlc->last_full_zap > 1 * HZ) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}

allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :
&node_online_map;
return allowednodes;
}

/*
* Given 'z' scanning a zonelist, run a couple of quick checks to see
* if it is worth looking at further for free memory:
* 1) Check that the zone isn't thought to be full (doesn't have its
* bit set in the zonelist_cache fullzones BITMAP).
* 2) Check that the zone's node (obtained from the zonelist_cache
* z_to_n[] mapping) is allowed in the passed-in allowednodes mask.
* Return true (non-zero) if zone is worth looking at further, or
* else return false (zero) if it is not.
*
* This check -ignores- the distinction between various watermarks,
* such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
* found to be full for any variation of these watermarks, it will
* be considered full for up to one second by all requests, unless
* we are so low on memory on all allowed nodes that we are forced
* into the second scan of the zonelist.
*
* In the second scan we ignore this zonelist cache and exactly
* apply the watermarks to all zones, even if it is slower to do so.
* We are low on memory in the second scan, and should leave no stone
* unturned looking for a free page.
*/
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */

zlc = zonelist->zlcache_ptr;
if (!zlc)
return 1;

i = z - zonelist->zones;
n = zlc->z_to_n[i];

/* This zone is worth trying if it is allowed but not full */
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}

/*
* Given 'z' scanning a zonelist, set the corresponding bit in
* zlc->fullzones, so that subsequent attempts to allocate a page
* from that zone don't waste time re-examining it.
*/
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */

zlc = zonelist->zlcache_ptr;
if (!zlc)
return;

i = z - zonelist->zones;

set_bit(i, zlc->fullzones);
}

#else /* CONFIG_NUMA */

static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
return NULL;
}

static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
nodemask_t *allowednodes)
{
return 1;
}

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
{
}
#endif /* CONFIG_NUMA */

/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -926,23 +1046,32 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, int alloc_flags)
{
struct zone **z = zonelist->zones;
struct zone **z;
struct page *page = NULL;
int classzone_idx = zone_idx(*z);
int classzone_idx = zone_idx(zonelist->zones[0]);
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */

zonelist_scan:
/*
* Go through the zonelist once, looking for a zone with enough free.
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
z = zonelist->zones;

do {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
zone = *z;
if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
break;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed(zone, gfp_mask))
continue;
goto try_next_zone;

if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
classzone_idx, alloc_flags)) {
if (!zone_reclaim_mode ||
!zone_reclaim(zone, gfp_mask, order))
continue;
goto this_zone_full;
}
}

page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
if (page)
break;

this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
try_next_zone:
if (NUMA_BUILD && !did_zlc_setup) {
/* we do zlc_setup after the first zone is tried */
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
} while (*(++z) != NULL);

if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
return page;
}
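Stripped of watermarks, cpuset checks, and reclaim, the control flow that get_page_from_freelist() gains above comes down to: scan once while trusting the zonelist cache, and if that pass finds nothing, rescan with the cache disabled. The following is a self-contained sketch of just that flow; the toy_* names and the zone_has_memory array are illustrative stand-ins, not kernel code.

#include <stdio.h>

#define TOY_NZONES 4

static int zone_has_memory[TOY_NZONES] = { 0, 0, 1, 0 };  /* toy stand-in for watermark checks */
static unsigned long toy_fullzones;                       /* models zlc->fullzones */

/* Returns the index of the zone we "allocated" from, or -1 on failure. */
static int toy_get_page(void)
{
        int zlc_active = 0;                        /* trust the cache on this pass? */
        int did_zlc_setup = 0;                     /* enable the cache only once */

restart:
        for (int i = 0; i < TOY_NZONES; i++) {
                if (zlc_active && ((toy_fullzones >> i) & 1))
                        continue;                  /* pass one: skip zones cached as full */
                if (zone_has_memory[i])
                        return i;                  /* success, like buffered_rmqueue() */
                toy_fullzones |= 1UL << i;         /* zlc_mark_zone_full() equivalent */
                if (!did_zlc_setup) {              /* zlc_setup() after the first miss */
                        zlc_active = 1;
                        did_zlc_setup = 1;
                }
        }
        if (zlc_active) {
                zlc_active = 0;                    /* pass two: look at every zone again */
                goto restart;
        }
        return -1;
}

int main(void)
{
        printf("allocated from zone %d\n", toy_get_page());   /* prints: zone 2 */
        return 0;
}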

@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
}

/* Construct the zonelist performance cache - see further comments in mmzone.h */
static void __meminit build_zonelist_cache(pg_data_t *pgdat)
{
int i;

for (i = 0; i < MAX_NR_ZONES; i++) {
struct zonelist *zonelist;
struct zonelist_cache *zlc;
struct zone **z;

zonelist = pgdat->node_zonelists + i;
zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
for (z = zonelist->zones; *z; z++)
zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
}
}

#else /* CONFIG_NUMA */

static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
}

/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void __meminit build_zonelist_cache(pg_data_t *pgdat)
{
int i;

for (i = 0; i < MAX_NR_ZONES; i++)
pgdat->node_zonelists[i].zlcache_ptr = NULL;
}

#endif /* CONFIG_NUMA */

/* Return type is int just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
int nid;
for_each_online_node(nid)

for_each_online_node(nid) {
build_zonelists(NODE_DATA(nid));
build_zonelist_cache(NODE_DATA(nid));
}
return 0;
}

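Both zlc helpers and build_zonelist_cache() above recover a zone's slot in the cache with the pointer difference z - zonelist->zones. A standalone illustration of that idiom, with array contents of our own invention:

#include <stdio.h>

int main(void)
{
        int zones[4] = { 10, 20, 30, 40 };   /* stand-in for zonelist->zones[] */
        int *z = &zones[2];                  /* cursor somewhere inside the array */

        /* Pointer subtraction yields the element index, exactly as
         * i = z - zonelist->zones does in the zlc helpers above. */
        int i = (int)(z - zones);

        printf("index = %d, value = %d\n", i, zones[i]);  /* index = 2, value = 30 */
        return 0;
}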
