Commit 1bf1ca9

Christoph Lameter authored and Linus Torvalds committed Jun 22, 2005
1 parent 29da7e5 commit 1bf1ca9
Showing 7 changed files with 196 additions and 39 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: 63551ae0feaaa23807ebea60de1901564bbef32e
refs/heads/master: e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a
2 changes: 1 addition & 1 deletion trunk/drivers/base/node.c
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *z = &pg->node_zones[i];
for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct per_cpu_pageset *ps = &z->pageset[cpu];
struct per_cpu_pageset *ps = zone_pcp(z,cpu);
numa_hit += ps->numa_hit;
numa_miss += ps->numa_miss;
numa_foreign += ps->numa_foreign;
6 changes: 6 additions & 0 deletions trunk/include/linux/mm.h
@@ -691,6 +691,12 @@ extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
#else
static inline void setup_per_cpu_pageset(void) {}
#endif

/* prio_tree.c */
void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
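For readers outside the kernel tree: the mm.h hunk above uses the usual CONFIG-stub idiom, so start_kernel() can call setup_per_cpu_pageset() unconditionally while !CONFIG_NUMA builds compile it away. A minimal user-space sketch of the same pattern, with USE_NUMA and the function bodies as illustrative stand-ins rather than kernel code:

```c
/* Sketch of the CONFIG-stub idiom from the mm.h hunk above.
 * USE_NUMA stands in for CONFIG_NUMA; the bodies are fake. */
#include <stdio.h>

#ifdef USE_NUMA
void setup_pagesets(void) { puts("NUMA: allocating per-cpu pagesets"); }
#else
static inline void setup_pagesets(void) { /* no-op on !NUMA builds */ }
#endif

int main(void)
{
    setup_pagesets();   /* the call site compiles identically either way */
    return 0;
}
```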
11 changes: 10 additions & 1 deletion trunk/include/linux/mmzone.h
@@ -63,6 +63,12 @@ struct per_cpu_pageset {
#endif
} ____cacheline_aligned_in_smp;

#ifdef CONFIG_NUMA
#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
#else
#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
#endif

#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
@@ -122,8 +128,11 @@ struct zone {
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
struct per_cpu_pageset *pageset[NR_CPUS];
#else
struct per_cpu_pageset pageset[NR_CPUS];

#endif
/*
* free areas of different sizes
*/
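A note on the accessor introduced above: on NUMA builds pageset[] holds per-node pointers, on non-NUMA builds it embeds the structures, and zone_pcp() makes both layouts yield a struct per_cpu_pageset *. A self-contained user-space reduction (NCPUS and the one-field struct are stand-ins, not kernel definitions); build with and without -DUSE_NUMA to exercise both forms:

```c
/* Reduced model of the mmzone.h change: same zone_pcp() caller,
 * two different pageset layouts. */
#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4

struct per_cpu_pageset { int count; };

struct zone {
#ifdef USE_NUMA
    struct per_cpu_pageset *pageset[NCPUS]; /* node-local pointers */
#else
    struct per_cpu_pageset pageset[NCPUS];  /* embedded array */
#endif
};

#ifdef USE_NUMA
#define zone_pcp(z, cpu) ((z)->pageset[(cpu)])
#else
#define zone_pcp(z, cpu) (&(z)->pageset[(cpu)])
#endif

int main(void)
{
    static struct zone z;   /* zero-initialized */
#ifdef USE_NUMA
    for (int cpu = 0; cpu < NCPUS; cpu++)
        z.pageset[cpu] = calloc(1, sizeof(struct per_cpu_pageset));
#endif
    zone_pcp(&z, 0)->count = 42;    /* identical caller code either way */
    printf("cpu0 count = %d\n", zone_pcp(&z, 0)->count);
    return 0;
}
```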
1 change: 1 addition & 0 deletions trunk/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
vfs_caches_init_early();
mem_init();
kmem_cache_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
late_time_init();
2 changes: 1 addition & 1 deletion trunk/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
zl->zones[0]->pageset[get_cpu()].interleave_hit++;
zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
put_cpu();
}
return page;
211 changes: 176 additions & 35 deletions trunk/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);

#ifdef CONFIG_NUMA
static struct per_cpu_pageset
pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
#endif

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
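For context on the pageset_table block above: the static __initdata table is only the boot-time backing store that free_area_init_core (later in this file's diff) wires into zone->pageset[] via a flat (nid, zone, cpu) index; setup_per_cpu_pageset() then replaces those entries with node-local kmalloc'd copies. A small check that the flat indexing is collision-free, with illustrative constants standing in for MAX_NUMNODES, MAX_NR_ZONES and NR_CPUS:

```c
/* Sketch of the pageset_table indexing used in free_area_init_core:
 * nid*MAX_NR_ZONES*NR_CPUS + zone*NR_CPUS + cpu. Constants are
 * illustrative, not the kernel's. */
#include <assert.h>

#define NODES 2
#define ZONES 3
#define CPUS  4

static int flat_index(int nid, int zone, int cpu)
{
    return nid * ZONES * CPUS + zone * CPUS + cpu;
}

int main(void)
{
    int seen[NODES * ZONES * CPUS] = { 0 };

    for (int n = 0; n < NODES; n++)
        for (int z = 0; z < ZONES; z++)
            for (int c = 0; c < CPUS; c++) {
                int idx = flat_index(n, z, c);

                assert(idx >= 0 && idx < NODES * ZONES * CPUS);
                assert(seen[idx] == 0); /* each triple gets its own slot */
                seen[idx] = 1;
            }
    return 0;
}
```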

@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
for_each_zone(zone) {
struct per_cpu_pageset *pset;

pset = &zone->pageset[cpu];
pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;

@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)

local_irq_save(flags);
cpu = smp_processor_id();
p = &z->pageset[cpu];
p = zone_pcp(z,cpu);
if (pg == orig) {
z->pageset[cpu].numa_hit++;
p->numa_hit++;
} else {
p->numa_miss++;
zonelist->zones[0]->pageset[cpu].numa_foreign++;
zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
p->local_node++;
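The counters updated above are the same ones the drivers/base/node.c hunk at the top of this commit reads for the per-node numastat output. A sketch of how they might be interpreted (the structure mirrors per_cpu_pageset's NUMA fields; the numbers are invented for illustration):

```c
/* Toy interpretation of the zone_statistics() counters. */
#include <stdio.h>

struct numa_counters {
    unsigned long numa_hit;     /* allocated in the intended zone */
    unsigned long numa_miss;    /* fell back to this zone instead */
    unsigned long numa_foreign; /* intended here, placed elsewhere */
};

int main(void)
{
    struct numa_counters c = {
        .numa_hit = 9600, .numa_miss = 400, .numa_foreign = 120,
    };
    double local = (double)c.numa_hit / (c.numa_hit + c.numa_miss);

    printf("hit ratio: %.1f%% (foreign: %lu)\n", 100.0 * local,
           c.numa_foreign);
    return 0;
}
```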
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
pcp = &zone->pageset[get_cpu()].pcp[cold];
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count >= pcp->high)
pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
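The per-cpu lists touched here behave as bounded caches: free_hot_cold_page drains a batch back to the buddy allocator once count reaches high, and buffered_rmqueue (next hunk) refills a batch once count falls to low. A toy counter model of that shape, using the hot-list watermarks the patch configures (low = 2*batch, high = 6*batch; batch = 31 is a plausible zone_batchsize result, not a value taken from the patch):

```c
/* Toy model of the pcp low/high/batch watermarks; the refill and
 * drain steps stand in for rmqueue_bulk()/free_pages_bulk(). */
#include <stdio.h>

struct pcp { int count, low, high, batch; };

static void take_one(struct pcp *p)
{
    if (p->count <= p->low)
        p->count += p->batch;   /* refill from the buddy lists */
    p->count--;
}

static void put_one(struct pcp *p)
{
    p->count++;
    if (p->count >= p->high)
        p->count -= p->batch;   /* drain a batch back */
}

int main(void)
{
    struct pcp hot = { .count = 0, .low = 62, .high = 186, .batch = 31 };

    for (int i = 0; i < 100; i++) take_one(&hot);
    printf("after 100 allocations: count=%d\n", hot.count);
    for (int i = 0; i < 200; i++) put_one(&hot);
    printf("after 200 frees:       count=%d\n", hot.count);
    return 0;
}
```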
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
if (order == 0) {
struct per_cpu_pages *pcp;

pcp = &zone->pageset[get_cpu()].pcp[cold];
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
if (!cpu_possible(cpu))
continue;

pageset = zone->pageset + cpu;
pageset = zone_pcp(zone, cpu);

for (temperature = 0; temperature < 2; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif

static int __devinit zone_batchsize(struct zone *zone)
{
int batch;

/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone. But no more than 1/4 of a meg - there's
* no point in going beyond the size of L2 cache.
*
* OK, so we don't know how big the cache is. So guess.
*/
batch = zone->present_pages / 1024;
if (batch * PAGE_SIZE > 256 * 1024)
batch = (256 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;

/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
*
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
batch = (1 << fls(batch + batch/2)) - 1;
return batch;
}
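A worked example of the function above, assuming 4 KiB pages and a 1 GiB zone (both assumptions, not values from the patch): 262144 present pages give batch = 256; the 256 KiB cap lowers it to 64; dividing by 4 gives 16; and the 2^n - 1 clamp via fls(16 + 16/2) = 5 yields 31. A user-space reproduction:

```c
/* zone_batchsize() re-run in user space for one worked example.
 * PAGE_SZ and the zone size are assumptions. */
#include <stdio.h>

#define PAGE_SZ 4096UL

static int fls_compat(unsigned long x) /* 1-based index of highest set bit */
{
    int r = 0;

    while (x) {
        r++;
        x >>= 1;
    }
    return r;
}

static int batchsize(unsigned long present_pages)
{
    unsigned long batch = present_pages / 1024;

    if (batch * PAGE_SZ > 256 * 1024)
        batch = (256 * 1024) / PAGE_SZ;
    batch /= 4;             /* the 2x/6x watermarks restore this */
    if (batch < 1)
        batch = 1;
    return (1 << fls_compat(batch + batch / 2)) - 1;
}

int main(void)
{
    int batch = batchsize((1UL << 30) / PAGE_SZ);   /* 1 GiB zone */

    printf("batch=%d  hot: low=%d high=%d  cold: high=%d\n",
           batch, 2 * batch, 6 * batch, 2 * batch);
    return 0;
}
```

With these inputs the program prints batch=31, giving hot low=62, hot high=186 and cold high=62, matching the multipliers process_zones() applies below.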

#ifdef CONFIG_NUMA
/*
* Dynamically allocate memory for the
* per cpu pageset array in struct zone.
*/
static int __devinit process_zones(int cpu)
{
struct zone *zone, *dzone;
int i;

for_each_zone(zone) {
struct per_cpu_pageset *npageset = NULL;

npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
if (!npageset) {
zone->pageset[cpu] = NULL;
goto bad;
}

if (zone->pageset[cpu]) {
memcpy(npageset, zone->pageset[cpu],
sizeof(struct per_cpu_pageset));

/* Relocate lists */
for (i = 0; i < 2; i++) {
INIT_LIST_HEAD(&npageset->pcp[i].list);
list_splice(&zone->pageset[cpu]->pcp[i].list,
&npageset->pcp[i].list);
}
} else {
struct per_cpu_pages *pcp;
unsigned long batch;

batch = zone_batchsize(zone);

pcp = &npageset->pcp[0]; /* hot */
pcp->count = 0;
pcp->low = 2 * batch;
pcp->high = 6 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);

pcp = &npageset->pcp[1]; /* cold */
pcp->count = 0;
pcp->low = 0;
pcp->high = 2 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
}
zone->pageset[cpu] = npageset;
}

return 0;
bad:
for_each_zone(dzone) {
if (dzone == zone)
break;
kfree(dzone->pageset[cpu]);
dzone->pageset[cpu] = NULL;
}
return -ENOMEM;
}
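process_zones() above follows the standard allocate-or-unwind shape: when kmalloc_node fails partway, it revisits only the zones already handled for this cpu and frees their fresh pagesets. The same idiom reduced to a plain array (sizes and names are illustrative):

```c
/* Allocate-or-unwind, as in process_zones()'s bad: path. */
#include <stdlib.h>

#define NSLOTS 6

static void *slot[NSLOTS];

static int alloc_all(void)
{
    int i;

    for (i = 0; i < NSLOTS; i++) {
        slot[i] = malloc(128);
        if (!slot[i])
            goto bad;
    }
    return 0;
bad:
    while (--i >= 0) {      /* free only what this call allocated */
        free(slot[i]);
        slot[i] = NULL;
    }
    return -1;              /* the kernel version returns -ENOMEM */
}

int main(void)
{
    return alloc_all() ? 1 : 0;
}
```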

static inline void free_zone_pagesets(int cpu)
{
#ifdef CONFIG_NUMA
struct zone *zone;

for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);

zone_pcp(zone, cpu) = NULL;
kfree(pset);
}
#endif
}

static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
int ret = NOTIFY_OK;

switch (action) {
case CPU_UP_PREPARE:
if (process_zones(cpu))
ret = NOTIFY_BAD;
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
free_zone_pagesets(cpu);
break;
#endif
default:
break;
}
return ret;
}

static struct notifier_block pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };

void __init setup_per_cpu_pageset(void)
{
int err;

/* Initialize per_cpu_pageset for cpu 0.
* A cpuup callback will do this for every cpu
* as it comes online
*/
err = process_zones(smp_processor_id());
BUG_ON(err);
register_cpu_notifier(&pageset_notifier);
}

#endif

/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone. But no more than 1/4 of a meg - there's
* no point in going beyond the size of L2 cache.
*
* OK, so we don't know how big the cache is. So guess.
*/
batch = zone->present_pages / 1024;
if (batch * PAGE_SIZE > 256 * 1024)
batch = (256 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;

/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
*
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
batch = (1 << fls(batch + batch/2)) - 1;
batch = zone_batchsize(zone);

for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct per_cpu_pages *pcp;
#ifdef CONFIG_NUMA
struct per_cpu_pageset *pgset;
pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
(j * NR_CPUS) + cpu];

zone->pageset[cpu] = pgset;
#else
struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
#endif

pcp = &zone->pageset[cpu].pcp[0]; /* hot */
pcp = &pgset->pcp[0]; /* hot */
pcp->count = 0;
pcp->low = 2 * batch;
pcp->high = 6 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);

pcp = &zone->pageset[cpu].pcp[1]; /* cold */
pcp = &pgset->pcp[1]; /* cold */
pcp->count = 0;
pcp->low = 0;
pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
struct per_cpu_pageset *pageset;
int j;

pageset = &zone->pageset[i];
pageset = zone_pcp(zone, i);
for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
if (pageset->pcp[j].count)
break;
