From b755de8dfdfef97effaa91379ffafcb81f4d62a1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Feb 2008 12:41:52 -0800 Subject: [PATCH 01/17] x86: make dev_to_node return online node a numa system (with multi HT chains) may return node without ram. Aka it is not online. Try to get an online node, otherwise return -1. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/pci/acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d95de2f199cda..ea8685f89bc38 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -172,6 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do set_mp_bus_to_node(busnum, node); else node = get_mp_bus_to_node(busnum); + + if (node != -1 && !node_online(node)) + node = -1; #endif /* Allocate per-root-bus (not per bus) arch-specific data. From dbb6152e6f72df367f8a955586c5e6282a7255e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Apr 2008 01:30:16 -0700 Subject: [PATCH 02/17] x86: don't call pxm_to_node again also make bus_numa work even if ACPI_NUMA is not defined. don't call pxm_to_node again, and use node directly. Signed-off-by: Yinghai Lu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/pci/acpi.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ea8685f89bc38..28d17a5cfb8d4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -171,11 +171,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (node != -1) set_mp_bus_to_node(busnum, node); else +#endif node = get_mp_bus_to_node(busnum); if (node != -1 && !node_online(node)) node = -1; -#endif /* Allocate per-root-bus (not per bus) arch-specific data. * TODO: leak; this memory is never freed. 
@@ -207,14 +207,16 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (!bus) kfree(sd); + if (bus && node != -1) { #ifdef CONFIG_ACPI_NUMA - if (bus) { - if (pxm >= 0) { + if (pxm >= 0) printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n", - busnum, pxm, pxm_to_node(pxm)); - } - } + busnum, pxm, node); +#else + printk(KERN_DEBUG "bus %02x -> node %d\n", + busnum, node); #endif + } if (bus && (pci_probe & PCI_USE__CRS)) get_current_resources(device, busnum, domain, bus); From 7496b60654e759d0b9008b80908e80727904b3c4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: [PATCH 03/17] x86: fix remove cpu_pda table patch Mike Travis wrote: > Ingo Molnar wrote: >> * Mike Travis wrote: >> >>> [Ingo - please replace "PATCH 07/11" with this one.] >>> >>> * Remove 544k bytes from the kernel by removing the boot_cpu_pda >>> array from the data section and allocating it during startup. >>> >>> Fixed panic in setup_per_cpu_areas when HOTPLUG_CPU not set. >>> >>> For inclusion into sched-devel/latest tree. >> sched-devel.git randconfig testing found another crash with your queue: >> >> [ 0.111060] Brought up 1 CPUs >> [ 0.111986] Total of 1 processors activated (4022.73 BogoMIPS). >> [ 0.112987] Testing NMI watchdog ... <1>BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 >> [ 0.114982] IP: [] check_nmi_watchdog+0xb0/0x210 >> [ 0.114982] PGD 0 >> [ 0.114982] Oops: 0000 [1] SMP >> [ 0.114982] CPU 0 >> [............] >> >> http://redhat.com/~mingo/misc/config-Mon_Apr_28_23_25_25_CEST_2008.bad >> http://redhat.com/~mingo/misc/log-Mon_Apr_28_23_25_25_CEST_2008.bad >> >> Ingo > > Hi Ingo, > > I need a bit more information on your hardware configuration. Building a > kernel with the above config file started up fine on both the Intel and AMD > boxes. > > Based on the above output it looks like it might be a UP machine? ... Ok, I think I found it. 
In check_nmi_watchdog(): for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; As I mentioned it works fine on both of my systems so could you try it out? Thanks! Mike -- * Change function check_nmi_watchdog() to use nr_cpu_ids instead of NR_CPUS. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded994fa3..2861b9408ac95 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -88,7 +88,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) return 0; - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) return -1; @@ -99,7 +99,7 @@ int __init check_nmi_watchdog(void) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); #endif - for (cpu = 0; cpu < NR_CPUS; cpu++) + for (cpu = 0; cpu < nr_cpu_ids; cpu++) prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks From 1184dc2ffe2c8fb9afb766d870850f2c3165ef25 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: [PATCH 04/17] x86: modify Kconfig to allow up to 4096 cpus * Increase the limit of NR_CPUS to 4096 and introduce a boolean called "MAXSMP" which when set (e.g. "allyesconfig"), will set NR_CPUS = 4096 and NODES_SHIFT = 9 (512). * Changed max setting for NODES_SHIFT from 15 to 9 to accurately reflect the real limit. 
Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bf07b6f50fa17..2e325521e5e95 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -561,20 +561,35 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) +config MAXSMP + bool "Configure Maximum number of SMP Processors and NUMA Nodes" + depends on X86_64 && SMP + default n + help + Configure maximum number of CPUS and NUMA Nodes for this architecture. + If unsure, say N. +if MAXSMP config NR_CPUS - int "Maximum number of CPUs (2-255)" - range 2 255 + int + default "4096" +endif + +if !MAXSMP +config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 depends on SMP default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 255 and the + kernel will support. The maximum supported value is 4096 and the minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +endif config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -965,13 +980,25 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +if MAXSMP + +config NODES_SHIFT + int + default "9" +endif + +if !MAXSMP config NODES_SHIFT - int "Max num nodes shift(1-9)" - range 1 9 if X86_64 + int "Maximum NUMA Nodes (as a power of 2)" + range 1 9 if X86_64 default "6" if X86_64 default "4" if X86_NUMAQ default "3" depends on NEED_MULTIPLE_NODES + help + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accomodate various tables. 
+endif config HAVE_ARCH_BOOTMEM_NODE def_bool y From 23ca4bba3e20c6c3cb11c1bb0ab4770b724d39ac Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: [PATCH 05/17] x86: cleanup early per cpu variables/accesses v4 * Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is used by some per_cpu variables that are initialized and accessed before there are per_cpu areas allocated. ["Early" in respect to per_cpu variables is "earlier than the per_cpu areas have been setup".] This patchset adds these new macros: DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) EXPORT_EARLY_PER_CPU_SYMBOL(_name) DECLARE_EARLY_PER_CPU(_type, _name) early_per_cpu_ptr(_name) early_per_cpu_map(_name, _idx) early_per_cpu(_name, _cpu) The DEFINE macro defines the per_cpu variable as well as the early map and pointer. It also initializes the per_cpu variable and map elements to "_initvalue". The early_* macros provide access to the initial map (usually setup during system init) and the early pointer. This pointer is initialized to point to the early map but is then NULL'ed when the actual per_cpu areas are setup. After that the per_cpu variable is the correct access to the variable. The early_per_cpu() macro is not very efficient but does show how to access the variable if you have a function that can be called both "early" and "late". It tests the early ptr to be NULL, and if not then it's still valid. Otherwise, the per_cpu variable is used instead: #define early_per_cpu(_name, _cpu) \ (early_per_cpu_ptr(_name) ? \ early_per_cpu_ptr(_name)[_cpu] : \ per_cpu(_name, _cpu)) A better method is to actually check the pointer manually. 
In the case below, numa_set_node can be called both "early" and "late": void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); if (cpu_to_node_map) cpu_to_node_map[cpu] = node; else per_cpu(x86_cpu_to_node_map, cpu) = node; } * Add a flag "arch_provides_topology_pointers" that indicates pointers to topology cpumask_t maps are available. Otherwise, use the function returning the cpumask_t value. This is useful if cpumask_t set size is very large to avoid copying data on to/off of the stack. * The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while the non-debug case has been optimized a bit. * Remove an unreferenced compiler warning in drivers/base/topology.c * Clean up #ifdef in setup.c For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/apic_32.c | 9 +-- arch/x86/kernel/apic_64.c | 11 ++- arch/x86/kernel/setup.c | 96 +++++++++++++++++++++---- arch/x86/kernel/setup_32.c | 24 ------- arch/x86/kernel/setup_64.c | 9 --- arch/x86/kernel/smpboot.c | 20 +----- arch/x86/mm/numa_64.c | 43 +++-------- arch/x86/mm/srat_64.c | 2 +- drivers/base/topology.c | 25 ++++++- include/asm-x86/numa_64.h | 19 +++-- include/asm-x86/percpu.h | 46 ++++++++++++ include/asm-x86/smp.h | 15 +--- include/asm-x86/topology.h | 143 +++++++++++++++++++++---------------- 15 files changed, 270 insertions(+), 196 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2e325521e5e95..4469a0db1ae1d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,7 +121,7 @@ config ARCH_HAS_CACHE_LINE_SIZE def_bool y config HAVE_SETUP_PER_CPU_AREA - def_bool X86_64 || (X86_SMP && !X86_VOYAGER) + def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER) 
config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 18363374d51a9..24ca95a0ba542 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -60,7 +60,7 @@ config DEBUG_PAGEALLOC config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL - depends on X86_64_SMP + depends on X86_SMP default n help Say Y to verify that the per_cpu map being accessed has diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6cb..f17c1c1bc3842 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -52,9 +52,6 @@ unsigned long mp_lapic_addr; -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - /* * Knob to control our willingness to enable the local APIC. * @@ -1534,9 +1531,9 @@ void __cpuinit generic_processor_info(int apicid, int version) } #ifdef CONFIG_SMP /* are we being called early in kernel startup? */ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); cpu_to_apicid[cpu] = apicid; bios_cpu_apicid[cpu] = apicid; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd0dc291..4fd21f7d698c5 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -87,9 +87,6 @@ static unsigned long apic_phys; unsigned long mp_lapic_addr; -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Get the LAPIC version @@ -1091,9 +1088,9 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu = 0; } /* are we being called early in kernel startup? 
*/ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); cpu_to_apicid[cpu] = apicid; bios_cpu_apicid[cpu] = apicid; @@ -1269,7 +1266,7 @@ __cpuinit int apic_is_clustered_box(void) if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) return 0; - bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b852a1961..03caa8e4351f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -19,13 +19,23 @@ unsigned disabled_cpus __cpuinitdata; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL(boot_cpu_physical_apicid); -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); - /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; #endif +/* map cpu index to physical APIC ID */ +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#define X86_64_NUMA 1 + +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); +#endif + #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) /* * Copy data used in early init routines from the initial arrays to the @@ -37,20 +47,21 @@ static void __init setup_per_cpu_maps(void) int cpu; for_each_possible_cpu(cpu) { - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; + 
per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = - x86_bios_cpu_apicid_init[cpu]; -#ifdef CONFIG_NUMA + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#ifdef X86_64_NUMA per_cpu(x86_cpu_to_node_map, cpu) = - x86_cpu_to_node_map_init[cpu]; + early_per_cpu_map(x86_cpu_to_node_map, cpu); #endif } /* indicate the early static arrays will soon be gone */ - x86_cpu_to_apicid_early_ptr = NULL; - x86_bios_cpu_apicid_early_ptr = NULL; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = NULL; + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#ifdef X86_64_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif } @@ -109,7 +120,8 @@ void __init setup_per_cpu_areas(void) if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO - "cpu %d has no node or node-local memory\n", i); + "cpu %d has no node %d or node-local memory\n", + i, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); @@ -137,3 +149,63 @@ void __init setup_per_cpu_areas(void) } #endif + +#ifdef X86_64_NUMA +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + if (cpu_to_node_map) + cpu_to_node_map[cpu] = node; + + else if (per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +} +#endif /* CONFIG_NUMA */ + +#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64) + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage 
too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +#endif diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5a2f8e0638875..ccd5f5cdbbe61 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -737,18 +737,6 @@ char * __init __attribute__((weak)) memory_setup(void) return machine_specific_memory_setup(); } -#ifdef CONFIG_NUMA -/* - * In the golden day, when everything among i386 and x86_64 will be - * integrated, this will not live here - */ -void *x86_cpu_to_node_map_early_ptr; -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -#endif - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -887,18 +875,6 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); -#ifdef CONFIG_X86_SMP - /* - * setup to use the early static init tables during kernel startup - * X86_SMP will exclude sub-arches that don't deal well with it. 
- */ - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; -#endif -#endif - #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff1286ad8ad..e8df64fad5401 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -406,15 +406,6 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif -#ifdef CONFIG_SMP - /* setup to use the early static init tables during kernel startup */ - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; -#endif -#endif - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e1cecedde427..036604d3daed0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,22 +68,6 @@ #include #include -/* - * FIXME: For x86_64, those are defined in other files. But moving them here, - * would make the setup areas dependent on smp, which is a loss. When we - * integrate apic between arches, we can probably do a better job, but - * right now, they'll stay here -- glommer - */ - -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = - { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_early_ptr; - -u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata - = { [0 ... 
NR_CPUS-1] = BAD_APICID }; -void *x86_bios_cpu_apicid_early_ptr; - #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; static int low_mappings; @@ -992,7 +976,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); #ifdef CONFIG_X86_64 - clear_node_cpumask(cpu); /* was set by numa_add_cpu */ + numa_remove_cpu(cpu); /* was set by numa_add_cpu */ #endif cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ @@ -1373,7 +1357,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ clear_bit(cpu, (unsigned long *)&cpu_initialized); - clear_node_cpumask(cpu); + numa_remove_cpu(cpu); #endif } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d519e5de..970f86775c41d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,16 +31,6 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -#ifdef CONFIG_SMP -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -void *x86_cpu_to_node_map_early_ptr; -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); -#endif -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); - s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; @@ -577,24 +567,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__cpuinit void numa_add_cpu(int cpu) -{ - set_bit(cpu, - (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if(cpu_to_node_map) - cpu_to_node_map[cpu] = node; - else if(per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; - else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); -} - unsigned long __init numa_free_all_bootmem(void) { unsigned long pages = 0; @@ -641,6 +613,7 @@ static __init int numa_setup(char *opt) } early_param("numa", numa_setup); +#ifdef CONFIG_NUMA /* * Setup early cpu_to_node. * @@ -652,14 +625,19 @@ early_param("numa", numa_setup); * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. 
*/ void __init init_cpu_to_node(void) { - int i; + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - for (i = 0; i < NR_CPUS; i++) { + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { int node; - u16 apicid = x86_cpu_to_apicid_init[i]; + u16 apicid = cpu_to_apicid[cpu]; if (apicid == BAD_APICID) continue; @@ -668,8 +646,9 @@ void __init init_cpu_to_node(void) continue; if (!node_online(node)) continue; - numa_set_node(i, node); + numa_set_node(cpu, node); } } +#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 99649dccad28c..012220e31c994 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; if (!node_isset(node, node_possible_map)) - numa_set_node(i, NUMA_NO_NODE); + numa_clear_node(i); } numa_init_array(); return 0; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index fdf4044d2e74a..1efe162e16d75 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -40,6 +40,7 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \ return sprintf(buf, "%d\n", topology_##name(cpu)); \ } +#if defined(topology_thread_siblings) || defined(topology_core_siblings) static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) { ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; @@ -54,21 +55,41 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) } return n; } +#endif +#ifdef arch_provides_topology_pointers #define define_siblings_show_map(name) \ -static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ +static ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ return show_cpumap(0, &(topology_##name(cpu)), buf); \ } #define define_siblings_show_list(name) \ -static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +static ssize_t 
show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ return show_cpumap(1, &(topology_##name(cpu)), buf); \ } +#else +#define define_siblings_show_map(name) \ +static ssize_t show_##name(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + cpumask_t mask = topology_##name(cpu); \ + return show_cpumap(0, &mask, buf); \ +} + +#define define_siblings_show_list(name) \ +static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + cpumask_t mask = topology_##name(cpu); \ + return show_cpumap(1, &mask, buf); \ +} +#endif + #define define_siblings_show_func(name) \ define_siblings_show_map(name); define_siblings_show_list(name) diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h index 22e87c9f6a805..b510daf4f4d86 100644 --- a/include/asm-x86/numa_64.h +++ b/include/asm-x86/numa_64.h @@ -14,11 +14,9 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -extern void numa_add_cpu(int cpu); extern void numa_init_array(void); extern int numa_off; -extern void numa_set_node(int cpu, int node); extern void srat_reserve_add_area(int nodeid); extern int hotadd_percent; @@ -31,15 +29,16 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, #ifdef CONFIG_NUMA extern void __init init_cpu_to_node(void); - -static inline void clear_node_cpumask(int cpu) -{ - clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]); -} - +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); #else -#define init_cpu_to_node() do {} while (0) -#define clear_node_cpumask(cpu) do {} while (0) +static inline void init_cpu_to_node(void) { } +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } 
+static inline void numa_add_cpu(int cpu, int node) { } +static inline void numa_remove_cpu(int cpu) { } #endif #endif diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h index 736fc3bb8e1ea..912a3a17b9db8 100644 --- a/include/asm-x86/percpu.h +++ b/include/asm-x86/percpu.h @@ -143,4 +143,50 @@ do { \ #define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) #endif /* !__ASSEMBLY__ */ #endif /* !CONFIG_X86_64 */ + +#ifdef CONFIG_SMP + +/* + * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu + * variables that are initialized and accessed before there are per_cpu + * areas allocated. + */ + +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue; \ + __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \ + { [0 ... NR_CPUS-1] = _initvalue }; \ + __typeof__(_type) *_name##_early_ptr = _name##_early_map + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ + extern __typeof__(_type) _name##_early_map[] + +#define early_per_cpu_ptr(_name) (_name##_early_ptr) +#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) +#define early_per_cpu(_name, _cpu) \ + (early_per_cpu_ptr(_name) ? 
\ + early_per_cpu_ptr(_name)[_cpu] : \ + per_cpu(_name, _cpu)) + +#else /* !CONFIG_SMP */ +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name) + +#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) +#define early_per_cpu_ptr(_name) NULL +/* no early_per_cpu_map() */ + +#endif /* !CONFIG_SMP */ + #endif /* _ASM_X86_PERCPU_H_ */ diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 1ebaa5cd31128..ec841639fb445 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -29,21 +29,12 @@ extern int smp_num_siblings; extern unsigned int num_processors; extern cpumask_t cpu_initialized; -#ifdef CONFIG_SMP -extern u16 x86_cpu_to_apicid_init[]; -extern u16 x86_bios_cpu_apicid_init[]; -extern void *x86_cpu_to_apicid_early_ptr; -extern void *x86_bios_cpu_apicid_early_ptr; -#else -#define x86_cpu_to_apicid_early_ptr NULL -#define x86_bios_cpu_apicid_early_ptr NULL -#endif - DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); -DECLARE_PER_CPU(u16, x86_cpu_to_apicid); -DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); + +DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); /* Static state in head.S used to set up a CPU */ extern struct { diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index dcf3f8131d6b4..dac09cb66dca4 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -35,87 +35,67 @@ # endif #endif +/* Node not present */ +#define NUMA_NO_NODE (-1) + #ifdef CONFIG_NUMA #include #include -/* Mappings between logical cpu number and node number */ #ifdef CONFIG_X86_32 -extern int cpu_to_node_map[]; -#else -/* Returns the number of the current Node. 
*/ -#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id())) -#endif - -DECLARE_PER_CPU(int, x86_cpu_to_node_map); - -#ifdef CONFIG_SMP -extern int x86_cpu_to_node_map_init[]; -extern void *x86_cpu_to_node_map_early_ptr; -#else -#define x86_cpu_to_node_map_early_ptr NULL -#endif +/* Mappings between node number and cpus on that node. */ extern cpumask_t node_to_cpumask_map[]; -#define NUMA_NO_NODE (-1) +/* Mappings between logical cpu number and node number */ +extern int cpu_to_node_map[]; /* Returns the number of the node containing CPU 'cpu' */ -#ifdef CONFIG_X86_32 -#define early_cpu_to_node(cpu) cpu_to_node(cpu) static inline int cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } +#define early_cpu_to_node(cpu) cpu_to_node(cpu) #else /* CONFIG_X86_64 */ -#ifdef CONFIG_SMP -static inline int early_cpu_to_node(int cpu) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if (cpu_to_node_map) - return cpu_to_node_map[cpu]; - else if (per_cpu_offset(cpu)) - return per_cpu(x86_cpu_to_node_map, cpu); - else - return NUMA_NO_NODE; -} -#else -#define early_cpu_to_node(cpu) cpu_to_node(cpu) -#endif +/* Mappings between node number and cpus on that node. */ +extern cpumask_t node_to_cpumask_map[]; + +/* Mappings between logical cpu number and node number */ +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); + +/* Returns the number of the current Node. 
*/ +#define numa_node_id() (per_cpu(x86_cpu_to_node_map, raw_smp_processor_id())) + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +extern int cpu_to_node(int cpu); +extern int early_cpu_to_node(int cpu); +extern cpumask_t *_node_to_cpumask_ptr(int node); +extern cpumask_t node_to_cpumask(int node); +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +/* Returns the number of the node containing CPU 'cpu' */ static inline int cpu_to_node(int cpu) { -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (x86_cpu_to_node_map_early_ptr) { - printk("KERN_NOTICE cpu_to_node(%d): usage too early!\n", - (int)cpu); - dump_stack(); - return ((int *)x86_cpu_to_node_map_early_ptr)[cpu]; - } -#endif return per_cpu(x86_cpu_to_node_map, cpu); } -#ifdef CONFIG_NUMA - -/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ -#define node_to_cpumask_ptr(v, node) \ - cpumask_t *v = &(node_to_cpumask_map[node]) - -#define node_to_cpumask_ptr_next(v, node) \ - v = &(node_to_cpumask_map[node]) -#endif +/* Same function but used if called before per_cpu areas are setup */ +static inline int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; -#endif /* CONFIG_X86_64 */ + return per_cpu(x86_cpu_to_node_map, cpu); +} -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +static inline cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &node_to_cpumask_map[node]; +} /* Returns a bitmask of CPUs on Node 'node'. 
*/ static inline cpumask_t node_to_cpumask(int node) @@ -123,14 +103,29 @@ static inline cpumask_t node_to_cpumask(int node) return node_to_cpumask_map[node]; } +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* CONFIG_X86_64 */ + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = _node_to_cpumask_ptr(node) + +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) + /* Returns the number of the first CPU on Node 'node'. */ static inline int node_to_first_cpu(int node) { - cpumask_t mask = node_to_cpumask(node); - - return first_cpu(mask); + node_to_cpumask_ptr(mask, node); + return first_cpu(*mask); } +/* + * Returns the number of the node containing Node 'node'. This + * architecture is flat, so it is a pretty simple function! + */ +#define parent_node(node) (node) + #define pcibus_to_node(bus) __pcibus_to_node(bus) #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) @@ -180,8 +175,31 @@ extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #endif -#else /* CONFIG_NUMA */ +#else /* !CONFIG_NUMA */ + +#define numa_node_id() 0 +#define cpu_to_node(cpu) 0 +#define early_cpu_to_node(cpu) 0 + +static inline cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &cpu_online_map; +} +static inline cpumask_t node_to_cpumask(int node) +{ + return cpu_online_map; +} +static inline int node_to_first_cpu(int node) +{ + return first_cpu(cpu_online_map); +} + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = _node_to_cpumask_ptr(node) +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) #endif #include @@ -193,6 +211,9 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, 
cpu)) + +/* indicates that pointers to the topology cpumask_t maps are valid */ +#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) @@ -220,4 +241,4 @@ static inline void set_mp_bus_to_node(int busnum, int node) } #endif -#endif +#endif /* _ASM_X86_TOPOLOGY_H */ From 7891a24e1ee50c96896c0cf7da216a8e7b573ca5 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: [PATCH 06/17] x86: restore pda nodenumber field * Restore the nodenumber field in the x86_64 pda. This field is slightly different than the x86_cpu_to_node_map mainly because it's a static indication of which node the cpu is on while the cpu to node map is a dynamic mapping that may get reset if the cpu goes offline. This also simplifies the numa_node_id() macro. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup.c | 4 ++++ include/asm-x86/pda.h | 1 + include/asm-x86/topology.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 03caa8e4351f7..0dff17ee3d733 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -32,6 +32,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) #define X86_64_NUMA 1 +/* map cpu index to node index */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); #endif @@ -155,6 +156,9 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + if (node != NUMA_NO_NODE) + cpu_pda(cpu)->nodenumber = node; + if (cpu_to_node_map) cpu_to_node_map[cpu] = node; diff --git a/include/asm-x86/pda.h
b/include/asm-x86/pda.h index 101fb9e11954e..de2ad9ac35a95 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,6 +22,7 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; + int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index dac09cb66dca4..c0e6ff7671ea5 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -66,7 +66,7 @@ extern cpumask_t node_to_cpumask_map[]; DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); /* Returns the number of the current Node. */ -#define numa_node_id() (per_cpu(x86_cpu_to_node_map, raw_smp_processor_id())) +#define numa_node_id() read_pda(nodenumber) #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); From 9f248bde9d47cc177011198c9a15fb339b9f3215 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: [PATCH 07/17] x86: remove the static 256k node_to_cpumask_map * Consolidate node_to_cpumask operations and remove the 256k byte node_to_cpumask_map. This is done by allocating the node_to_cpumask_map array after the number of possible nodes (nr_node_ids) is known. * Debug printouts when CONFIG_DEBUG_PER_CPU_MAPS is active have been increased. It now shows faults when calling node_to_cpumask() and node_to_cpumask_ptr(). For inclusion into sched-devel/latest tree. 
Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup.c | 132 +++++++++++++++++++++++++++++++++++-- arch/x86/mm/numa_64.c | 6 -- include/asm-x86/topology.h | 25 ++++--- 3 files changed, 144 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0dff17ee3d733..913af838c3c56 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -35,6 +35,16 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* map cpu index to node index */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* which logical CPUs are on which nodes */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* setup node_to_cpumask_map */ +static void __init setup_node_to_cpumask_map(void); + +#else +static inline void setup_node_to_cpumask_map(void) { } #endif #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) @@ -140,11 +150,15 @@ void __init setup_per_cpu_areas(void) } nr_cpu_ids = highest_cpu + 1; - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", + NR_CPUS, nr_cpu_ids, nr_node_ids); /* Setup percpu data maps */ setup_per_cpu_maps(); + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + /* Setup cpumask_of_cpu map */ setup_cpumask_of_cpu(); } @@ -152,6 +166,35 @@ void __init setup_per_cpu_areas(void) #endif #ifdef X86_64_NUMA + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. 
+ */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + + Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -174,6 +217,8 @@ void __cpuinit numa_clear_node(int cpu) numa_set_node(cpu, NUMA_NO_NODE); } +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + void __cpuinit numa_add_cpu(int cpu) { cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); @@ -183,9 +228,44 @@ void __cpuinit numa_remove_cpu(int cpu) { cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); } -#endif /* CONFIG_NUMA */ -#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64) +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), *mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable? 
"numa_add_cpu":"numa_remove_cpu", cpu, node, buf); + } + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} int cpu_to_node(int cpu) { @@ -199,6 +279,10 @@ int cpu_to_node(int cpu) } EXPORT_SYMBOL(cpu_to_node); +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ int early_cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) @@ -207,9 +291,47 @@ int early_cpu_to_node(int cpu) if (!per_cpu_offset(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); + dump_stack(); return NUMA_NO_NODE; } return per_cpu(x86_cpu_to_node_map, cpu); } -#endif + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +cpumask_t *_node_to_cpumask_ptr(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return &cpu_online_map; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(_node_to_cpumask_ptr); + +/* + * Returns a bitmask of CPUs on Node 'node'. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ + +#endif /* X86_64_NUMA */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 970f86775c41d..14c7ab417ec72 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -35,9 +35,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_to_cpumask_map); - int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; @@ -560,9 +557,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - /* cpumask_of_cpu() may not be available during early startup */ - memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); - cpu_set(0, node_to_cpumask_map[0]); e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index c0e6ff7671ea5..1f97758de4ab7 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -57,10 +57,16 @@ static inline int cpu_to_node(int cpu) } #define early_cpu_to_node(cpu) cpu_to_node(cpu) +/* Returns a bitmask of CPUs on Node 'node'. */ +static inline cpumask_t node_to_cpumask(int node) +{ + return node_to_cpumask_map[node]; +} + #else /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. */ -extern cpumask_t node_to_cpumask_map[]; +extern cpumask_t *node_to_cpumask_map; /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -104,7 +110,6 @@ static inline cpumask_t node_to_cpumask(int node) } #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_X86_64 */ /* Replace default node_to_cpumask_ptr with optimized version */ #define node_to_cpumask_ptr(v, node) \ @@ -113,12 +118,7 @@ static inline cpumask_t node_to_cpumask(int node) #define node_to_cpumask_ptr_next(v, node) \ v = _node_to_cpumask_ptr(node) -/* Returns the number of the first CPU on Node 'node'. 
*/ -static inline int node_to_first_cpu(int node) -{ - node_to_cpumask_ptr(mask, node); - return first_cpu(*mask); -} +#endif /* CONFIG_X86_64 */ /* * Returns the number of the node containing Node 'node'. This @@ -204,6 +204,15 @@ static inline int node_to_first_cpu(int node) #include +#ifdef CONFIG_NUMA +/* Returns the number of the first CPU on Node 'node'. */ +static inline int node_to_first_cpu(int node) +{ + node_to_cpumask_ptr(mask, node); + return first_cpu(*mask); +} +#endif + extern cpumask_t cpu_coregroup_map(int cpu); #ifdef ENABLE_TOPO_DEFINES From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: [PATCH 08/17] x86: remove static boot_cpu_pda array v2 * Remove the boot_cpu_pda array and pointer table from the data section. Allocate the pointer table and array during init. do_boot_cpu() will reallocate the pda in node local memory and if the cpu is being brought up before the bootmem array is released (after_bootmem = 0), then it will free the initial pda. This will happen for all cpus present at system startup. This removes 512k + 32k bytes from the data section. For inclusion into sched-devel/latest tree. 
Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 26 ++++++++++++-- arch/x86/kernel/setup.c | 73 ++++++++++++++++++++++++++++++--------- arch/x86/kernel/setup64.c | 8 +++-- arch/x86/kernel/smpboot.c | 59 ++++++++++++++++++++++++------- include/asm-x86/pda.h | 6 ++-- include/linux/mm.h | 1 + 6 files changed, 135 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa844..0ab59edd70676 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,24 @@ #include #include +/* boot cpu pda */ +static struct x8664_pda _boot_cpu_pda __read_mostly; + +#ifdef CONFIG_SMP +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * We install an empty cpu_pda pointer table to trap references before + * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). 
+ */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +#else +static struct x8664_pda *__cpu_pda[1] __read_mostly; +#endif + +#else /* !CONFIG_SMP (NR_CPUS will be 1) */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +#endif + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; pda_init(0); + + early_printk("Kernel really alive\n"); + copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 913af838c3c56..dd12c1c84a8fc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { } */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. 
+ */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} #endif /* @@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i, highest_cpu = 0; - unsigned long size; + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); +#else + nr_cpu_ids = num_processors; #endif + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_possible_cpu(i) { - char *ptr; + for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else - int node = early_cpu_to_node(i); + int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", - i, node); + cpu, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); -#ifdef CONFIG_X86_64 - cpu_pda(i)->data_offset = ptr - __per_cpu_start; -#else - __per_cpu_offset[i] = ptr - 
__per_cpu_start; -#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - highest_cpu = i; } - nr_cpu_ids = highest_cpu + 1; printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", NR_CPUS, nr_cpu_ids, nr_node_ids); @@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (node != NUMA_NO_NODE) + if (cpu_pda(cpu) && node != NUMA_NO_NODE) cpu_pda(cpu)->nodenumber = node; if (cpu_to_node_map) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index aee0e82007770..631ea6cc01d8b 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -34,9 +35,8 @@ struct boot_params boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -114,8 +114,10 @@ void pda_init(int cpu) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) panic("cannot allocate irqstack for cpu %d", cpu); - } + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } pda->irqstackptr += IRQSTACKSIZE-64; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 036604d3daed0..bf08334874559 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* + * Allocate node local memory for the AP pda. + * + * Must be called after the _cpu_pda pointer table is initialized. 
+ */ +static int __cpuinit get_local_pda(int cpu) +{ + struct x8664_pda *oldpda, *newpda; + unsigned long size = sizeof(struct x8664_pda); + int node = cpu_to_node(cpu); + + if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) + return 0; + + oldpda = cpu_pda(cpu); + newpda = kmalloc_node(size, GFP_ATOMIC, node); + if (!newpda) { + printk(KERN_ERR "Could not allocate node local PDA " + "for CPU %d on node %d\n", cpu, node); + + if (oldpda) + return 0; /* have a usable pda */ + else + return -1; + } + + if (oldpda) { + memcpy(newpda, oldpda, size); + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, size); + } + + newpda->in_bootmem = 0; + cpu_pda(cpu) = newpda; + return 0; +} + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof(struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); + if (cpu > 0) { + boot_error = get_local_pda(cpu); + if (boot_error) + goto restore_state; + /* if can't get pda memory, can't start cpu */ } #endif @@ -972,6 +1001,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } } +restore_state: + if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_logical_apicid(cpu); @@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void) for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); + + nr_cpu_ids = possible; } static void __ref remove_cpu_from_maps(int cpu) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index de2ad9ac35a95..b34e9a7cc80b3 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,8 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - int nodenumber; /* number of current node */ + short nodenumber; /* number of current node (32k max) */ + short in_bootmem; /* pda lives in bootmem */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; @@ -38,8 +39,7 @@ struct x8664_pda { unsigned irq_spurious_count; } ____cacheline_aligned_in_smp; -extern struct x8664_pda *_cpu_pda[]; -extern struct x8664_pda boot_cpu_pda[]; +extern struct x8664_pda **_cpu_pda; extern void pda_init(int); #define cpu_pda(i) (_cpu_pda[i]) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab018..0ea48a5af823d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,6 +1024,7 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern int after_bootmem; #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); From 5deb0b2a25b7b568ab81f7c38052572ecf4ccc96 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: [PATCH 09/17] x86: leave initial __cpu_pda array in place until cpus are booted Ingo Molnar wrote: ... > they crashed after about 3 randconfig iterations with: > > early res: 4 [8000-afff] PGTABLE > early res: 5 [b000-b87f] MEMNODEMAP > PANIC: early exception 0e rip 10:ffffffff8077a150 error 2 cr2 37 > Pid: 0, comm: swapper Not tainted 2.6.25-sched-devel.git-x86-latest.git #14 > > Call Trace: > [] early_idt_handler+0x56/0x6a > [] ? 
numa_set_node+0x30/0x60 > [] ? numa_set_node+0x9/0x60 > [] numa_init_array+0x93/0xf0 > [] acpi_scan_nodes+0x3b9/0x3f0 > [] numa_initmem_init+0x136/0x150 > [] setup_arch+0x48f/0x700 > [] ? clockevents_register_notifier+0x3a/0x50 > [] start_kernel+0xd7/0x440 > [] x86_64_start_kernel+0x222/0x280 ... Here's the fixup... This one should follow the previous patches. Thanks, Mike Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0ab59edd70676..4bcb61cd9fcd5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -29,17 +29,13 @@ static struct x8664_pda _boot_cpu_pda __read_mostly; #ifdef CONFIG_SMP -#ifdef CONFIG_DEBUG_PER_CPU_MAPS /* - * We install an empty cpu_pda pointer table to trap references before - * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). + * We install an empty cpu_pda pointer table to indicate to early users + * (numa_set_node) that the cpu_pda pointer table for cpus other than + * the boot cpu is not yet setup. 
*/ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; #else -static struct x8664_pda *__cpu_pda[1] __read_mostly; -#endif - -#else /* !CONFIG_SMP (NR_CPUS will be 1) */ static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif From f307d25e638d3408659a2ec98fb3fd1737f7cb31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 21 May 2008 11:21:13 +0100 Subject: [PATCH 10/17] x86: compile error fix for smpboot.c Without this patch, my link fails with: arch/x86/kernel/built-in.o(.cpuinit.text+0x3c6e): In function `get_local_pda': : undefined reference to `_cpu_pda' arch/x86/kernel/built-in.o(.cpuinit.text+0x3cd1): In function `get_local_pda': : undefined reference to `after_bootmem' arch/x86/kernel/built-in.o(.cpuinit.text+0x3cec): In function `get_local_pda': : undefined reference to `_cpu_pda' make[2]: *** [.tmp_vmlinux1] Error 1 Caused by commit 766da892634694f795b18b9538407816896fc470 x86: remove static boot_cpu_pda array v2 Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bf08334874559..bc1e1257e5158 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +#ifdef CONFIG_X86_64 /* * Allocate node local memory for the AP pda. * @@ -852,6 +853,7 @@ static int __cpuinit get_local_pda(int cpu) cpu_pda(cpu) = newpda; return 0; } +#endif /* CONFIG_X86_64 */ static int __cpuinit do_boot_cpu(int apicid, int cpu) /* From 864fc31ea59798905a37cd896a3e093915a3b366 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: [PATCH 11/17] x86: numa_64.c make local variables static plat_node_bdata, cmdline, nodemap_addr, nodemap_size are local to numa_64.c. 
Make them static Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 14c7ab417ec72..824344f1742f0 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -27,7 +27,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -bootmem_data_t plat_node_bdata[MAX_NUMNODES]; +static bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; @@ -36,8 +36,8 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { }; int numa_off __initdata; -unsigned long __initdata nodemap_addr; -unsigned long __initdata nodemap_size; +static unsigned long __initdata nodemap_addr; +static unsigned long __initdata nodemap_size; /* * Given a shift value, try to populate memnodemap[] @@ -296,7 +296,7 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -char *cmdline __initdata; +static char *cmdline __initdata; /* * Setups up nid to range from addr to addr + size. 
If the end From 886533a3e370a6d5c4e46819d1e14bd2f20dbb3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: [PATCH 12/17] x86: numa_64.c fix shadowed variable sparse mutters: arch/x86/mm/numa_64.c:195:27: warning: symbol 'end_pfn' shadows an earlier one Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 824344f1742f0..a1f3778b46801 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start, void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; + unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); @@ -191,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, start, end); start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; + last_pfn = end >> PAGE_SHIFT; node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, SMP_CACHE_BYTES); @@ -204,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; /* * Find a place for the bootmem map @@ -213,7 +213,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, * early_node_mem will get that with find_e820_area instead * of alloc_bootmem, that could clash with reserved range */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + 
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); nid = phys_to_nid(nodedata_phys); if (nid == nodeid) bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); @@ -235,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + start_pfn, last_pfn); printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, @@ -400,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, } /* - * Sets up the system RAM area from start_pfn to end_pfn according to the + * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) { u64 size, addr = start_pfn << PAGE_SHIFT; - u64 max_addr = end_pfn << PAGE_SHIFT; + u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); @@ -514,7 +514,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) } #endif /* CONFIG_NUMA_EMU */ -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init numa_initmem_init(unsigned long start_pfn, unsigned long last_pfn) { int i; @@ -522,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, last_pfn)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -530,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_ACPI_NUMA if (!numa_off && 
!acpi_scan_nodes(start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT)) + last_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -538,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_K8_NUMA if (!numa_off && !k8_scan_nodes(start_pfn< Date: Mon, 12 May 2008 21:21:12 +0200 Subject: [PATCH 13/17] sched, numa: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c, where appropriate. This saves some allocated space as well as many wasted cycles going through node entries that are non-existent. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 94ead43eda62b..bcc22b569ee90 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; @@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; *nodemask = node_to_cpumask(i); @@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif 
/* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { SCHED_CPUMASK_VAR(nodemask, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, send_covered, tmpmask); } - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { + for (j = 0; j < nr_node_ids; j++) { SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % MAX_NUMNODES; + int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); cpus_complement(*notcovered, *covered); @@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { From 03db1f74a7d823e3de3767f36b1e08829f6fb3a1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 6 Jun 2008 16:33:25 +0200 Subject: [PATCH 14/17] x86: don't return invalid pointers from node_to_cpumask() Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index dd12c1c84a8fc..df49ce87a3003 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -350,6 +350,7 @@ cpumask_t *_node_to_cpumask_ptr(int node) dump_stack(); return &cpu_online_map; } + BUG_ON(node >= nr_node_ids); return &node_to_cpumask_map[node]; } EXPORT_SYMBOL(_node_to_cpumask_ptr); @@ -365,6 +366,7 @@ cpumask_t node_to_cpumask(int node) dump_stack(); return cpu_online_map; } + BUG_ON(node >= nr_node_ids); return node_to_cpumask_map[node]; } 
EXPORT_SYMBOL(node_to_cpumask); From 053713f5745b8b08fb598adb65230bc168cb9d8d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Jun 2008 11:10:59 -0700 Subject: [PATCH 15/17] x86: fix setup.c printk format warning Fix setup.c printk format warning: linux-next-20080605/arch/x86/kernel/setup.c: In function 'setup_per_cpu_areas': linux-next-20080605/arch/x86/kernel/setup.c:173: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'ssize_t' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index df49ce87a3003..d4eaa4eb481db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -169,7 +169,7 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { From 3fd052b1b46ac23a2316283a996fe6c32dbcf132 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Sun, 8 Jun 2008 15:46:30 +0200 Subject: [PATCH 16/17] x86: add flags parameter to reserve_bootmem_generic() This patch adds a 'flags' parameter to reserve_bootmem_generic() like it already has been added in reserve_bootmem() with commit 72a7fe3967dbf86cb34e24fbf1d957fe24d2f246. It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively change the behaviour. Since the change is x86-specific, I don't think it's necessary to add a new API for migration. There are only 4 users of that function. The change is necessary for the next patch, using reserve_bootmem_generic() for crashkernel reservation. 
Signed-off-by: Bernhard Walle Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 3 ++- arch/x86/kernel/efi_64.c | 3 ++- arch/x86/kernel/mpparse.c | 5 +++-- arch/x86/mm/init_64.c | 17 ++++++++++++----- include/asm-x86/proto.h | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 124480c0008dd..af1eb0789740b 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigned long start, unsigned long end) continue; printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, final_start, final_end - 1, r->name); - reserve_bootmem_generic(final_start, final_end - final_start); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); } } diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index d0060fdcccac1..d561dd5f1e629 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void) void __init efi_reserve_bootmem(void) { reserve_bootmem_generic((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); + memmap.nr_map * memmap.desc_size, + BOOTMEM_DEFAULT); } void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 404683b94e795..4901ae3f742cc 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -729,10 +729,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) reserve_bootmem_generic(mpf->mpf_physptr, - PAGE_SIZE); + PAGE_SIZE, BOOTMEM_DEFAULT); #endif return 1; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad973b137..bf7bf1de6c25a 100644 --- a/arch/x86/mm/init_64.c +++ 
b/arch/x86/mm/init_64.c @@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) { #ifdef CONFIG_NUMA int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; + int ret; if (pfn >= end_pfn) { /* @@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * firmware tables: */ if (pfn < max_pfn_mapped) - return; + return -EFAULT; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); - return; + return -EFAULT; } /* Should check here against the e820 map to avoid double free */ @@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) nid = phys_to_nid(phys); next_nid = phys_to_nid(phys + len - 1); if (nid == next_nid) - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem(phys, len, flags); + + if (ret != 0) + return ret; + #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif @@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); } + + return 0; } int kern_addr_valid(unsigned long addr) diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index 6c8b41b03f6de..a9f51472521e9 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -14,7 +14,7 @@ extern void ia32_syscall(void); extern void ia32_cstar_target(void); extern void ia32_sysenter_target(void); -extern void reserve_bootmem_generic(unsigned long phys, unsigned len); +extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags); extern void syscall32_cpu_init(void); From 46f68e1c6b04a04772e828ff3bcd07ed708805c2 Mon Sep 17 00:00:00 2001 From: 
Bernhard Walle Date: Sun, 8 Jun 2008 15:46:31 +0200 Subject: [PATCH 17/17] x86: use reserve_bootmem_generic() to reserve crashkernel memory on x86_64 This patch uses reserve_bootmem_generic() instead of reserve_bootmem() to reserve the crashkernel memory on x86_64. That's necessary for NUMA machines, see 00212fef814612245ed0261cbac8426d0c9a31a5: [PATCH] Fix kdump Crash Kernel boot memory reservation for NUMA machines This patch will fix a boot memory reservation bug that trashes memory on the ES7000 when loading the kdump crash kernel. The code in arch/x86_64/kernel/setup.c to reserve boot memory for the crash kernel uses the non-numa aware "reserve_bootmem" function instead of the NUMA aware "reserve_bootmem_generic". I checked to make sure that no other function was using "reserve_bootmem" and found none, except the ones that had NUMA ifdef'ed out. I have tested this patch only on an ES7000 with NUMA on and off (numa=off) in a single (non-NUMA) and multi-cell (NUMA) configurations. Signed-off-by: Amul Shah Looks-good-to: Vivek Goyal Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds The switch-back to reserve_bootmem() was accidentally introduced in 5c3391f9f749023a49c64d607da4fb49263690eb when adding the BOOTMEM_EXCLUSIVE parameter. Signed-off-by: Bernhard Walle Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index e8df64fad5401..4a666cdccb689 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -243,7 +243,7 @@ static void __init reserve_crashkernel(void) return; } - if (reserve_bootmem(crash_base, crash_size, + if (reserve_bootmem_generic(crash_base, crash_size, BOOTMEM_EXCLUSIVE) < 0) { printk(KERN_INFO "crashkernel reservation failed - " "memory is in use\n");