From ffe0d5a575459ffe664b0762130b557f826fcace Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2009 09:17:56 +0900 Subject: [PATCH 1/6] percpu: fix unit_map[] verification in pcpu_setup_first_chunk() pcpu_setup_first_chunk() incorrectly used NR_CPUS as the impossible unit number while unit number can equal and go over NR_CPUS with sparse unit map. This triggers BUG_ON() spuriously on machines which have non-power-of-two number of cpus. Use UINT_MAX instead. Signed-off-by: Tejun Heo Reported-and-tested-by: Tony Vroon --- mm/percpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 43d8cacfdaa5e..e5c4cbda60269 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1604,7 +1604,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) - unit_map[cpu] = NR_CPUS; + unit_map[cpu] = UINT_MAX; pcpu_first_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { @@ -1619,7 +1619,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, continue; BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); - BUG_ON(unit_map[cpu] != NR_CPUS); + BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; @@ -1632,7 +1632,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_units = unit; for_each_possible_cpu(cpu) - BUG_ON(unit_map[cpu] == NR_CPUS); + BUG_ON(unit_map[cpu] == UINT_MAX); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; From fb59e72e7e10fd9d31f4e522f1b28254c2cc8a6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Sep 2009 18:50:34 +0900 Subject: [PATCH 2/6] percpu: make pcpu_build_alloc_info() clear static buffers pcpu_build_alloc_info() may be called multiple times when percpu is falling back to different first chunk allocator. Make it clear static buffers so that they don't contain values from previous runs. Signed-off-by: Tejun Heo --- mm/percpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/percpu.c b/mm/percpu.c index e5c4cbda60269..a64133f8af45b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1347,6 +1347,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( struct pcpu_alloc_info *ai; unsigned int *cpu_map; + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_map)); + /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest From a70c691376c7c7f94af41395848066f59501fffd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Sep 2009 18:18:55 +0900 Subject: [PATCH 3/6] sparc64: implement page mapping percpu first chunk allocator Implement page mapping percpu first chunk allocator as a fallback to the embedding allocator. The next patch will make the embedding allocator check distances between units to determine whether it fits within the vmalloc area so that this fallback can be used on such cases. sparc64 currently has relatively small vmalloc area which makes it impossible to create any dynamic chunks on certain configurations leading to percpu allocation failures. This and the next patch should allow those configurations to keep working until proper solution is found. While at it, mark pcpu_cpu_distance() with __init. Signed-off-by: Tejun Heo Acked-by: David S. Miller --- arch/sparc/Kconfig | 3 +++ arch/sparc/kernel/smp_64.c | 53 +++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 97fca4695e0b8..ac45aab741a50 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -102,6 +102,9 @@ config HAVE_SETUP_PER_CPU_AREA config NEED_PER_CPU_EMBED_FIRST_CHUNK def_bool y if SPARC64 +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y if SPARC64 + config GENERIC_HARDIRQS_NO__DO_IRQ bool def_bool y if SPARC64 diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index ff68373ce6d67..aa36223497b93 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1420,7 +1420,7 @@ static void __init pcpu_free_bootmem(void *ptr, size_t size) free_bootmem(__pa(ptr), size); } -static int pcpu_cpu_distance(unsigned int from, unsigned int to) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { if (cpu_to_node(from) == cpu_to_node(to)) return LOCAL_DISTANCE; @@ -1428,18 +1428,53 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } +static void __init pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; + pmd_t *pmd; + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + pmd_populate_kernel(&init_mm, pmd, new); + } +} + void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; - int rc; - - rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, 4 << 20, - pcpu_cpu_distance, pcpu_alloc_bootmem, - pcpu_free_bootmem); - if (rc) - panic("failed to initialize first chunk (%d)", rc); + int rc = -EINVAL; + + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, 4 << 20, + pcpu_cpu_distance, + pcpu_alloc_bootmem, + pcpu_free_bootmem); + if (rc) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + pcpu_alloc_bootmem, + pcpu_free_bootmem, + pcpu_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) From 6ea529a2037ce662fc6bfa572b46d47407d08805 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Sep 2009 18:46:01 +0900 Subject: [PATCH 4/6] percpu: make embedding first chunk allocator check vmalloc space size Embedding first chunk allocator maintains the distances between units in the vmalloc area and thus needs vmalloc space to be larger than the maximum distances between units; otherwise, it wouldn't be able to create any dynamic chunks. This patch makes the embedding first chunk allocator check vmalloc space size and if the maximum distance between units is larger than 75% of it, print warning and, if page mapping allocator is available, fail initialization so that the system falls back onto it. This should work around percpu allocation failure problems on certain sparc64 configurations where distances between NUMA nodes are larger than the vmalloc area and makes percpu allocator more robust for future configurations. Signed-off-by: Tejun Heo --- mm/percpu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a64133f8af45b..c43da8c024d1d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1786,7 +1786,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, areas_size; + size_t size_sum, areas_size, max_distance; int group, i, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, @@ -1836,8 +1836,24 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* base address is now known, determine group base offsets */ - for (group = 0; group < ai->nr_groups; group++) + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; + max_distance = max(max_distance, ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%lx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, From 635b75fc18858d3522e481c043de764766db923c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Sep 2009 09:43:11 +0900 Subject: [PATCH 5/6] percpu: make pcpu_setup_first_chunk() failures more verbose The parameters to pcpu_setup_first_chunk() come from different sources depending on architecture and can be quite complex. The function runs various sanity checks on the parameters and triggers BUG() if something isn't right. However, this is very early during the boot and not reporting exactly what the problem is makes debugging even harder. Add PCPU_SETUP_BUG() macro which prints out enough information about the parameters. As the macro still puts separate BUG() for each check, it won't lose any information even on the situations where only the program counter can be retrieved. While at it, also bump pcpu_dump_alloc_info() message to KERN_INFO so that it's visible on the console if boot fails to complete. Signed-off-by: Tejun Heo --- mm/percpu.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index c43da8c024d1d..83617ca3ba527 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1578,6 +1578,7 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { + static char cpus_buf[4096] __initdata; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; @@ -1589,17 +1590,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int *unit_map; int group, unit, i; + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(ai->nr_groups <= 0); - BUG_ON(!ai->static_size); - BUG_ON(!base_addr); - BUG_ON(ai->unit_size < size_sum); - BUG_ON(ai->unit_size & ~PAGE_MASK); - BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); - - pcpu_dump_alloc_info(KERN_DEBUG, ai); + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); /* process group information and build config tables accordingly */ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); @@ -1622,8 +1632,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (cpu == NR_CPUS) continue; - BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); - BUG_ON(unit_map[cpu] != UINT_MAX); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; @@ -1636,7 +1647,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_units = unit; for_each_possible_cpu(cpu) - BUG_ON(unit_map[cpu] == UINT_MAX); + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); + + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_INFO, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; From f2badb0c950ed308be9b321203b9c8d341690cd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Sep 2009 09:17:58 +0900 Subject: [PATCH 6/6] percpu: make allocation failures more verbose Warn and dump stack when percpu allocation fails. percpu allocator is still young and unchecked NULL percpu pointer usage can result in random memory corruption when combined with the pointer shifting in access macros. Allocation failures should be rare and the warning message will be disabled after certain times. Signed-off-by: Tejun Heo --- mm/percpu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 83617ca3ba527..4a048abad0436 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1043,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; + const char *err; int slot, off; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { @@ -1059,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + pcpu_extend_area_map(chunk) < 0) { + err = "failed to extend area map of reserved chunk"; goto fail_unlock; + } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -1080,6 +1085,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) case 1: goto restart; /* pcpu_lock dropped, restart */ default: + err = "failed to extend area map"; goto fail_unlock; } @@ -1093,8 +1099,10 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irq(&pcpu_lock); chunk = alloc_pcpu_chunk(); - if (!chunk) + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); @@ -1107,6 +1115,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) if (pcpu_populate_chunk(chunk, off, size)) { spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } @@ -1119,6 +1128,13 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irq(&pcpu_lock); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; }