Skip to content

Commit

Permalink
[PATCH] sched: new sched domain for representing multi-core
Browse files Browse the repository at this point in the history
Add a new sched domain for representing multi-core with shared caches
between cores.  Consider a dual package system, each package containing two
cores and with last level cache shared between cores with in a package.  If
there are two runnable processes, with this appended patch those two
processes will be scheduled on different packages.

On such systems, with this patch we have observed 8% perf improvement with
specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2
users).

This new domain will come into play only on multi-core systems with shared
caches.  On other systems, this sched domain will be removed by domain
degeneration code.  This new domain can be also used for implementing power
savings policy (see OLS 2005 CMP kernel scheduler paper for more details..
I will post another patch for power savings policy soon)

Most of the arch/* file changes are for cpu_coregroup_map() implementation.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
Siddha, Suresh B authored and Linus Torvalds committed Mar 27, 2006
1 parent 77e4bfb commit 1e9f28f
Show file tree
Hide file tree
Showing 14 changed files with 186 additions and 11 deletions.
9 changes: 9 additions & 0 deletions arch/i386/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,15 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.

config SCHED_MC
bool "Multi-core scheduler support"
depends on SMP
default y
help
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.

source "kernel/Kconfig.preempt"

config X86_UP_APIC
Expand Down
10 changes: 7 additions & 3 deletions arch/i386/kernel/cpu/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
void __cpuinit generic_identify(struct cpuinfo_x86 * c)
{
u32 tfms, xlvl;
int junk;
int ebx;

if (have_cpuid_p()) {
/* Get vendor name */
Expand All @@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
/* Intel-defined flags: level 0x00000001 */
if ( c->cpuid_level >= 0x00000001 ) {
u32 capability, excap;
cpuid(0x00000001, &tfms, &junk, &excap, &capability);
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
c->x86 = (tfms >> 8) & 15;
Expand All @@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
c->x86_mask = tfms & 15;
#ifdef CONFIG_SMP
c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
#else
c->apicid = (ebx >> 24) & 0xFF;
#endif
} else {
/* Have CPUID level 0 only - unheard of */
c->x86 = 4;
Expand Down Expand Up @@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)

cpuid(1, &eax, &ebx, &ecx, &edx);

c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);

if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
Expand Down
22 changes: 20 additions & 2 deletions arch/i386/kernel/cpu/intel_cacheinfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
#ifdef CONFIG_SMP
unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
#endif

if (c->cpuid_level > 3) {
static int is_initialized;
Expand Down Expand Up @@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
break;
case 2:
new_l2 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
l2_id = c->apicid >> index_msb;
break;
case 3:
new_l3 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
l3_id = c->apicid >> index_msb;
break;
default:
break;
Expand Down Expand Up @@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l1i)
l1i = new_l1i;

if (new_l2)
if (new_l2) {
l2 = new_l2;
#ifdef CONFIG_SMP
cpu_llc_id[cpu] = l2_id;
#endif
}

if (new_l3)
if (new_l3) {
l3 = new_l3;
#ifdef CONFIG_SMP
cpu_llc_id[cpu] = l3_id;
#endif
}

if ( trace )
printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
Expand Down
24 changes: 24 additions & 0 deletions arch/i386/kernel/smpboot.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
/* Core ID of each logical CPU */
int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};

/* Last level cache ID of each logical CPU */
int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};

/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_sibling_map);
Expand Down Expand Up @@ -440,6 +443,18 @@ static void __devinit smp_callin(void)

static int cpucount;

/* maps the cpu to the sched domain representing multi-core */
cpumask_t cpu_coregroup_map(int cpu)
{
struct cpuinfo_x86 *c = cpu_data + cpu;
/*
* For perf, we return last level cache shared map.
* TBD: when power saving sched policy is added, we will return
* cpu_core_map when power saving policy is enabled
*/
return c->llc_shared_map;
}

/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;

Expand All @@ -459,19 +474,28 @@ set_cpu_sibling_map(int cpu)
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}

cpu_set(cpu, c[cpu].llc_shared_map);

if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
c[cpu].booted_cores = 1;
return;
}

for_each_cpu_mask(i, cpu_sibling_setup_map) {
if (cpu_llc_id[cpu] != BAD_APICID &&
cpu_llc_id[cpu] == cpu_llc_id[i]) {
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
if (phys_proc_id[cpu] == phys_proc_id[i]) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
Expand Down
9 changes: 9 additions & 0 deletions arch/x86_64/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,15 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.

config SCHED_MC
bool "Multi-core scheduler support"
depends on SMP
default y
help
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.

source "kernel/Kconfig.preempt"

config NUMA
Expand Down
3 changes: 2 additions & 1 deletion arch/x86_64/kernel/setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)

cpuid(1, &eax, &ebx, &ecx, &edx);

c->apicid = phys_pkg_id(0);

if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
Expand Down Expand Up @@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[2] = cpuid_edx(0x80860001);
}

c->apicid = phys_pkg_id(0);

/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
Expand Down
24 changes: 24 additions & 0 deletions arch/x86_64/kernel/smpboot.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
/* core ID of each logical CPU */
u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };

/* Last level cache ID of each logical CPU */
u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};

/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;

Expand Down Expand Up @@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}

/* maps the cpu to the sched domain representing multi-core */
cpumask_t cpu_coregroup_map(int cpu)
{
struct cpuinfo_x86 *c = cpu_data + cpu;
/*
* For perf, we return last level cache shared map.
* TBD: when power saving sched policy is added, we will return
* cpu_core_map when power saving policy is enabled
*/
return c->llc_shared_map;
}

/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;

Expand All @@ -463,19 +478,28 @@ static inline void set_cpu_sibling_map(int cpu)
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}

cpu_set(cpu, c[cpu].llc_shared_map);

if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
c[cpu].booted_cores = 1;
return;
}

for_each_cpu_mask(i, cpu_sibling_setup_map) {
if (cpu_llc_id[cpu] != BAD_APICID &&
cpu_llc_id[cpu] == cpu_llc_id[i]) {
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
if (phys_proc_id[cpu] == phys_proc_id[i]) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
Expand Down
5 changes: 5 additions & 0 deletions include/asm-i386/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <linux/config.h>
#include <linux/threads.h>
#include <asm/percpu.h>
#include <linux/cpumask.h>

/* flag for disabling the tsc */
extern int tsc_disable;
Expand Down Expand Up @@ -67,6 +68,9 @@ struct cpuinfo_x86 {
char pad0;
int x86_power;
unsigned long loops_per_jiffy;
#ifdef CONFIG_SMP
cpumask_t llc_shared_map; /* cpus sharing the last level cache */
#endif
unsigned char x86_max_cores; /* cpuid returned max cores value */
unsigned char booted_cores; /* number of cores as seen by OS */
unsigned char apicid;
Expand Down Expand Up @@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];

extern int phys_proc_id[NR_CPUS];
extern int cpu_core_id[NR_CPUS];
extern int cpu_llc_id[NR_CPUS];
extern char ignore_fpu_irq;

extern void identify_cpu(struct cpuinfo_x86 *);
Expand Down
2 changes: 2 additions & 0 deletions include/asm-i386/topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];

#endif /* CONFIG_NUMA */

extern cpumask_t cpu_coregroup_map(int cpu);

#endif /* _ASM_I386_TOPOLOGY_H */
4 changes: 4 additions & 0 deletions include/asm-x86_64/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <asm/mmsegment.h>
#include <asm/percpu.h>
#include <linux/personality.h>
#include <linux/cpumask.h>

#define TF_MASK 0x00000100
#define IF_MASK 0x00000200
Expand Down Expand Up @@ -65,6 +66,9 @@ struct cpuinfo_x86 {
__u32 x86_power;
__u32 extended_cpuid_level; /* Max extended CPUID function supported */
unsigned long loops_per_jiffy;
#ifdef CONFIG_SMP
cpumask_t llc_shared_map; /* cpus sharing the last level cache */
#endif
__u8 apicid;
__u8 booted_cores; /* number of cores as seen by OS */
} ____cacheline_aligned;
Expand Down
1 change: 1 addition & 0 deletions include/asm-x86_64/smp.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
extern cpumask_t cpu_core_map[NR_CPUS];
extern u8 phys_proc_id[NR_CPUS];
extern u8 cpu_core_id[NR_CPUS];
extern u8 cpu_llc_id[NR_CPUS];

#define SMP_TRAMPOLINE_BASE 0x6000

Expand Down
2 changes: 2 additions & 0 deletions include/asm-x86_64/topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,6 @@ extern int __node_distance(int, int);

#include <asm-generic/topology.h>

extern cpumask_t cpu_coregroup_map(int cpu);

#endif
9 changes: 9 additions & 0 deletions include/linux/topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,15 @@
.nr_balance_failed = 0, \
}

#ifdef CONFIG_SCHED_MC
#ifndef SD_MC_INIT
/* for now its same as SD_CPU_INIT.
* TBD: Tune Domain parameters!
*/
#define SD_MC_INIT SD_CPU_INIT
#endif
#endif

#ifdef CONFIG_NUMA
#ifndef SD_NODE_INIT
#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
Expand Down
Loading

0 comments on commit 1e9f28f

Please sign in to comment.