Skip to content

Commit

Permalink
x86: add remapping percpu first chunk allocator
Browse files Browse the repository at this point in the history
Impact: add better first percpu allocation for NUMA

On NUMA, embedding allocator can't be used as different units can't be
made to fall in the correct NUMA nodes.  To use large page mapping,
each unit needs to be remapped.  However, percpu areas are usually
much smaller than large page size and unused space hurts a lot as the
number of cpus grow.  This allocator remaps large pages for each chunk
but gives back unused part to the bootmem allocator making the large
pages mapped twice.

This adds slightly to the TLB pressure but is much better than using
4k mappings while still being NUMA-friendly.

Ingo suggested that this would be the correct approach for NUMA.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
  • Loading branch information
Tejun Heo committed Feb 24, 2009
1 parent 89c9215 commit 8ac8375
Showing 1 changed file with 135 additions and 2 deletions.
137 changes: 135 additions & 2 deletions arch/x86/kernel/setup_percpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
#endif
}

/*
* Remap allocator
*
* This allocator uses PMD page as unit. A PMD page is allocated for
* each cpu and each is remapped into vmalloc area using PMD mapping.
* As PMD page is quite large, only part of it is used for the first
* chunk. Unused part is returned to the bootmem allocator.
*
* So, the PMD pages are mapped twice - once to the physical mapping
* and to the vmalloc area for the first percpu chunk. The double
* mapping does add one more PMD TLB entry pressure but still is much
* better than only using 4k mappings while still being NUMA friendly.
*/
#ifdef CONFIG_NEED_MULTIPLE_NODES
static size_t pcpur_size __initdata;
static void **pcpur_ptrs __initdata;

static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;

if (off >= pcpur_size)
return NULL;

return virt_to_page(pcpur_ptrs[cpu] + off);
}

static ssize_t __init setup_pcpu_remap(size_t static_size)
{
static struct vm_struct vm;
pg_data_t *last;
size_t ptrs_size;
unsigned int cpu;
ssize_t ret;

/*
* If large page isn't supported, there's no benefit in doing
* this. Also, on non-NUMA, embedding is better.
*/
if (!cpu_has_pse || pcpu_need_numa())
return -EINVAL;

last = NULL;
for_each_possible_cpu(cpu) {
int node = early_cpu_to_node(cpu);

if (node_online(node) && NODE_DATA(node) &&
last && last != NODE_DATA(node))
goto proceed;

last = NODE_DATA(node);
}
return -EINVAL;

proceed:
/*
* Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary.
*/
pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
if (pcpur_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n");
return -EINVAL;
}

/* allocate pointer array and alloc large pages */
ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
pcpur_ptrs = alloc_bootmem(ptrs_size);

for_each_possible_cpu(cpu) {
pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
if (!pcpur_ptrs[cpu])
goto enomem;

/*
* Only use pcpur_size bytes and give back the rest.
*
* Ingo: The 2MB up-rounding bootmem is needed to make
* sure the partial 2MB page is still fully RAM - it's
* not well-specified to have a PAT-incompatible area
* (unmapped RAM, device memory, etc.) in that hole.
*/
free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
PMD_SIZE - pcpur_size);

memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
}

/* allocate address and map */
vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * PMD_SIZE;
vm_area_register_early(&vm, PMD_SIZE);

for_each_possible_cpu(cpu) {
pmd_t *pmd;

pmd = populate_extra_pmd((unsigned long)vm.addr
+ cpu * PMD_SIZE);
set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
PAGE_KERNEL_LARGE));
}

/* we're ready, commit */
pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", vm.addr, static_size);

ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
pcpur_size - static_size, vm.addr, NULL);
goto out_free_ar;

enomem:
for_each_possible_cpu(cpu)
if (pcpur_ptrs[cpu])
free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
ret = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pcpur_ptrs), ptrs_size);
return ret;
}
#else
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
return -EINVAL;
}
#endif

/*
* Embedding allocator
*
Expand Down Expand Up @@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void)
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

/* allocate percpu area */
ret = setup_pcpu_embed(static_size);
/*
* Allocate percpu area. If PSE is supported, try to make use
* of large page mappings. Please read comments on top of
* each allocator for details.
*/
ret = setup_pcpu_remap(static_size);
if (ret < 0)
ret = setup_pcpu_embed(static_size);
if (ret < 0)
ret = setup_pcpu_4k(static_size);
if (ret < 0)
Expand Down

0 comments on commit 8ac8375

Please sign in to comment.