---

yaml --- r: 63832 b: refs/heads/master c: 8b80fc0 h: refs/heads/master v: v3
git-mirror · Aug 9, 2007 · b2147a4 · b2147a4
1 parent dc5cd8a
commit b2147a4
Show file tree

Hide file tree

Showing 73 changed files with 1,291 additions and 660 deletions.
diff --git a/[refs] b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 4011cd97886dd04b90fef8b671b9936cd39ab983
+refs/heads/master: 8b80fc02b829a59602b0f53eb9393ffb2db2659d
diff --git a/trunk/Documentation/lguest/Makefile b/trunk/Documentation/lguest/Makefile
@@ -13,7 +13,9 @@ LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
 
 CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
 LDLIBS:=-lz
-
+# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
+# not others (eg. FC7).
+LDFLAGS+=-static
 all: lguest.lds lguest
 
 # The linker script on x86 is so complex the only way of creating one

diff --git a/trunk/Documentation/sched-design-CFS.txt b/trunk/Documentation/sched-design-CFS.txt
@@ -83,7 +83,7 @@ Some implementation details:
    CFS uses nanosecond granularity accounting and does not rely on any
    jiffies or other HZ detail. Thus the CFS scheduler has no notion of
    'timeslices' and has no heuristics whatsoever. There is only one
-   central tunable:
+   central tunable (you have to switch on CONFIG_SCHED_DEBUG):
 
          /proc/sys/kernel/sched_granularity_ns
 

diff --git a/trunk/Documentation/sched-nice-design.txt b/trunk/Documentation/sched-nice-design.txt
@@ -0,0 +1,108 @@
+This document explains the thinking about the revamped and streamlined
+nice-levels implementation in the new Linux scheduler.
+
+Nice levels were always pretty weak under Linux and people continuously
+pestered us to make nice +19 tasks use up much less CPU time.
+
+Unfortunately that was not that easy to implement under the old
+scheduler, (otherwise we'd have done it long ago) because nice level
+support was historically coupled to timeslice length, and timeslice
+units were driven by the HZ tick, so the smallest timeslice was 1/HZ.
+
+In the O(1) scheduler (in 2003) we changed negative nice levels to be
+much stronger than they were before in 2.4 (and people were happy about
+that change), and we also intentionally calibrated the linear timeslice
+rule so that nice +19 level would be _exactly_ 1 jiffy. To better
+understand it, the timeslice graph went like this (cheesy ASCII art
+alert!):
+
+
+                   A
+             \     | [timeslice length]
+              \    |
+               \   |
+                \  |
+                 \ |
+                  \|___100msecs
+                   |^ . _
+                   |      ^ . _
+                   |            ^ . _
+ -*----------------------------------*-----> [nice level]
+ -20               |                +19
+                   |
+                   |
+
+So that if someone wanted to really renice tasks, +19 would give a much
+bigger hit than the normal linear rule would do. (The solution of
+changing the ABI to extend priorities was discarded early on.)
+
+This approach worked to some degree for some time, but later on with
+HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which
+we felt to be a bit excessive. Excessive _not_ because it's too small of
+a CPU utilization, but because it causes too frequent (once per
+millisec) rescheduling. (and would thus trash the cache, etc. Remember,
+this was long ago when hardware was weaker and caches were smaller, and
+people were running number crunching apps at nice +19.)
+
+So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the
+right minimal granularity - and this translates to 5% CPU utilization.
+But the fundamental HZ-sensitive property for nice+19 still remained,
+and we never got a single complaint about nice +19 being too _weak_ in
+terms of CPU utilization, we only got complaints about it (still) being
+too _strong_ :-)
+
+To sum it up: we always wanted to make nice levels more consistent, but
+within the constraints of HZ and jiffies and their nasty design level
+coupling to timeslices and granularity it was not really viable.
+
+The second (less frequent but still periodically occurring) complaint
+about Linux's nice level support was its asymmetry around the origin
+(which you can see demonstrated in the picture above), or more
+accurately: the fact that nice level behavior depended on the _absolute_
+nice level as well, while the nice API itself is fundamentally
+"relative":
+
+   int nice(int inc);
+
+   asmlinkage long sys_nice(int increment)
+
+(the first one is the glibc API, the second one is the syscall API.)
+Note that the 'inc' is relative to the current nice level. Tools like
+bash's "nice" command mirror this relative API.
+
+With the old scheduler, if you for example started a niced task with +1
+and another task with +2, the CPU split between the two tasks would
+depend on the nice level of the parent shell - if it was at nice -10 the
+CPU split was different than if it was at +5 or +10.
+
+A third complaint against Linux's nice level support was that negative
+nice levels were not 'punchy enough', so lots of people had to resort to
+running audio (and other multimedia) apps under RT priorities such as
+SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation
+proof, and a buggy SCHED_FIFO app can also lock up the system for good.
+
+The new scheduler in v2.6.23 addresses all three types of complaints:
+
+To address the first complaint (of nice +19 tasks using up too much CPU
+time), the scheduler was decoupled from 'time slice' and HZ concepts
+(and granularity was made a separate concept from nice levels) and thus
+it was possible to implement better and more consistent nice +19
+support: with the new scheduler nice +19 tasks get a HZ-independent
+1.5%, instead of the variable 3%-5%-9% range they got in the old
+scheduler.
+
+To address the second complaint (of nice levels not being consistent),
+the new scheduler makes nice(1) have the same CPU utilization effect on
+tasks, regardless of their absolute nice levels. So on the new
+scheduler, running a nice +10 and a nice +11 task has the same CPU
+utilization "split" between them as running a nice -5 and a nice -4
+task. (one will get 55% of the CPU, the other 45%.) That is why nice
+levels were changed to be "multiplicative" (or exponential) - that way
+it does not matter which nice level you start out from, the 'relative
+result' will always be the same.
+
+The third complaint (of negative nice levels not being "punchy" enough
+and forcing audio apps to run under the more dangerous SCHED_FIFO
+scheduling policy) is addressed by the new scheduler almost
+automatically: stronger negative nice levels are an automatic
+side-effect of the recalibrated dynamic range of nice levels.
diff --git a/trunk/arch/sparc/kernel/prom.c b/trunk/arch/sparc/kernel/prom.c
@@ -102,6 +102,21 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len
 }
 EXPORT_SYMBOL(of_set_property);
 
+int of_find_in_proplist(const char *list, const char *match, int len)
+{	/* Search 'list' for an entry equal to 'match'.  'list' is walked
+	 * as a run of back-to-back NUL-terminated strings occupying
+	 * 'len' bytes in total.  Returns 1 if some entry compares equal
+	 * to 'match', 0 if the byte budget runs out first.
+	 */
+	while (len > 0) {
+		int l;
+
+		if (!strcmp(list, match))
+			return 1;
+		l = strlen(list) + 1;	/* entry length, counting its NUL */
+		list += l;		/* step to the next entry */
+		len -= l;		/* bytes still left to scan */
+	}
+	return 0;
+}
+EXPORT_SYMBOL(of_find_in_proplist);
+
 static unsigned int prom_early_allocated;
 
 static void * __init prom_early_alloc(unsigned long size)

diff --git a/trunk/arch/sparc64/kernel/cpu.c b/trunk/arch/sparc64/kernel/cpu.c
@@ -1,7 +1,7 @@
 /* cpu.c: Dinky routines to look for the kind of Sparc cpu
  *        we are on.
  *
- * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996, 2007 David S. Miller (davem@davemloft.net)
  */
 
 #include <linux/kernel.h>
@@ -13,6 +13,7 @@
 #include <asm/fpumacro.h>
 #include <asm/cpudata.h>
 #include <asm/spitfire.h>
+#include <asm/oplib.h>
 
 DEFINE_PER_CPU(cpuinfo_sparc, __cpu_data) = { 0 };
 
@@ -61,21 +62,40 @@ struct cpu_iu_info linux_sparc_chips[] = {
 
 #define NSPARCCHIPS  ARRAY_SIZE(linux_sparc_chips)
 
-char *sparc_cpu_type = "cpu-oops";
-char *sparc_fpu_type = "fpu-oops";
+char *sparc_cpu_type;
+char *sparc_fpu_type;
 
 unsigned int fsr_storage;
 
+static void __init sun4v_cpu_probe(void)
+{	/* Pick the sparc_cpu_type / sparc_fpu_type description strings
+	 * from the value of sun4v_chip_type.  A value other than the two
+	 * Niagara constants logs a warning that quotes
+	 * prom_cpu_compatible and selects "Unknown SUN4V" strings.
+	 */
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_NIAGARA1:
+		sparc_cpu_type = "UltraSparc T1 (Niagara)";
+		sparc_fpu_type = "UltraSparc T1 integrated FPU";
+		break;
+
+	case SUN4V_CHIP_NIAGARA2:
+		sparc_cpu_type = "UltraSparc T2 (Niagara2)";
+		sparc_fpu_type = "UltraSparc T2 integrated FPU";
+		break;
+
+	default:	/* any other chip type value */
+		printk(KERN_WARNING "CPU: Unknown sun4v cpu type [%s]\n",
+		       prom_cpu_compatible);
+		sparc_cpu_type = "Unknown SUN4V CPU";
+		sparc_fpu_type = "Unknown SUN4V FPU";
+		break;
+	}
+}
+
 void __init cpu_probe(void)
 {
 	unsigned long ver, fpu_vers, manuf, impl, fprs;
 	int i;
 
-	if (tlb_type == hypervisor) {
-		sparc_cpu_type = "UltraSparc T1 (Niagara)";
-		sparc_fpu_type = "UltraSparc T1 integrated FPU";
-		return;
-	}
+	if (tlb_type == hypervisor)
+		return sun4v_cpu_probe();
 
 	fprs = fprs_read();
 	fprs_write(FPRS_FEF);

diff --git a/trunk/arch/sparc64/kernel/head.S b/trunk/arch/sparc64/kernel/head.S
@@ -97,7 +97,8 @@ sparc64_boot:
 	.globl	prom_map_name, prom_unmap_name, prom_mmu_ihandle_cache
 	.globl	prom_boot_mapped_pc, prom_boot_mapping_mode
 	.globl	prom_boot_mapping_phys_high, prom_boot_mapping_phys_low
-	.globl	is_sun4v
+	.globl	prom_compatible_name, prom_cpu_path, prom_cpu_compatible
+	.globl	is_sun4v, sun4v_chip_type
 prom_peer_name:
 	.asciz	"peer"
 prom_compatible_name:
@@ -106,6 +107,8 @@ prom_finddev_name:
 	.asciz	"finddevice"
 prom_chosen_path:
 	.asciz	"/chosen"
+prom_cpu_path:
+	.asciz	"/cpu"
 prom_getprop_name:
 	.asciz	"getprop"
 prom_mmu_name:
@@ -120,9 +123,13 @@ prom_unmap_name:
 	.asciz	"unmap"
 prom_sun4v_name:
 	.asciz	"sun4v"
+prom_niagara_prefix:
+	.asciz	"SUNW,UltraSPARC-T"
 	.align	4
 prom_root_compatible:
 	.skip	64
+prom_cpu_compatible:
+	.skip	64
 prom_root_node:
 	.word	0
 prom_mmu_ihandle_cache:
@@ -138,6 +145,8 @@ prom_boot_mapping_phys_low:
 	.xword	0
 is_sun4v:
 	.word	0
+sun4v_chip_type:
+	.word	SUN4V_CHIP_INVALID
 1:
 	rd	%pc, %l0
 
@@ -296,21 +305,94 @@ is_sun4v:
 	sethi	%hi(prom_sun4v_name), %g7
 	or	%g7, %lo(prom_sun4v_name), %g7
 	mov	5, %g3
-1:	ldub	[%g7], %g2
+90:	ldub	[%g7], %g2
 	ldub	[%g1], %g4
 	cmp	%g2, %g4
-	bne,pn	%icc, 2f
+	bne,pn	%icc, 80f
 	 add	%g7, 1, %g7
 	subcc	%g3, 1, %g3
-	bne,pt	%xcc, 1b
+	bne,pt	%xcc, 90b
 	 add	%g1, 1, %g1
 
 	sethi	%hi(is_sun4v), %g1
 	or	%g1, %lo(is_sun4v), %g1
 	mov	1, %g7
 	stw	%g7, [%g1]
 
-2:
+	/* cpu_node = prom_finddevice("/cpu") */
+	mov	(1b - prom_finddev_name), %l1
+	mov	(1b - prom_cpu_path), %l2
+	sub	%l0, %l1, %l1
+	sub	%l0, %l2, %l2
+	sub	%sp, (192 + 128), %sp
+
+	stx	%l1, [%sp + 2047 + 128 + 0x00]	! service, "finddevice"
+	mov	1, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x08]	! num_args, 1
+	stx	%l3, [%sp + 2047 + 128 + 0x10]	! num_rets, 1
+	stx	%l2, [%sp + 2047 + 128 + 0x18]	! arg1, "/cpu"
+	stx	%g0, [%sp + 2047 + 128 + 0x20]	! ret1
+	call	%l7
+	 add	%sp, (2047 + 128), %o0		! argument array
+
+	ldx	[%sp + 2047 + 128 + 0x20], %l4	! cpu device node
+
+	mov	(1b - prom_getprop_name), %l1
+	mov	(1b - prom_compatible_name), %l2
+	mov	(1b - prom_cpu_compatible), %l5
+	sub	%l0, %l1, %l1
+	sub	%l0, %l2, %l2
+	sub	%l0, %l5, %l5
+
+	/* prom_getproperty(cpu_node, "compatible",
+	 *                  &prom_cpu_compatible, 64)
+	 */
+	stx	%l1, [%sp + 2047 + 128 + 0x00]	! service, "getprop"
+	mov	4, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x08]	! num_args, 4
+	mov	1, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x10]	! num_rets, 1
+	stx	%l4, [%sp + 2047 + 128 + 0x18]	! arg1, cpu_node
+	stx	%l2, [%sp + 2047 + 128 + 0x20]	! arg2, "compatible"
+	stx	%l5, [%sp + 2047 + 128 + 0x28]	! arg3, &prom_cpu_compatible
+	mov	64, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x30]	! arg4, size
+	stx	%g0, [%sp + 2047 + 128 + 0x38]	! ret1
+	call	%l7
+	 add	%sp, (2047 + 128), %o0		! argument array
+
+	add	%sp, (192 + 128), %sp
+
+	sethi	%hi(prom_cpu_compatible), %g1
+	or	%g1, %lo(prom_cpu_compatible), %g1
+	sethi	%hi(prom_niagara_prefix), %g7
+	or	%g7, %lo(prom_niagara_prefix), %g7
+	mov	17, %g3
+90:	ldub	[%g7], %g2
+	ldub	[%g1], %g4
+	cmp	%g2, %g4
+	bne,pn	%icc, 4f
+	 add	%g7, 1, %g7
+	subcc	%g3, 1, %g3
+	bne,pt	%xcc, 90b
+	 add	%g1, 1, %g1
+
+	sethi	%hi(prom_cpu_compatible), %g1
+	or	%g1, %lo(prom_cpu_compatible), %g1
+	ldub	[%g1 + 17], %g2
+	cmp	%g2, '1'
+	be,pt	%xcc, 5f
+	 mov	SUN4V_CHIP_NIAGARA1, %g4
+	cmp	%g2, '2'
+	be,pt	%xcc, 5f
+	 mov	SUN4V_CHIP_NIAGARA2, %g4
+4:
+	mov	SUN4V_CHIP_UNKNOWN, %g4
+5:	sethi	%hi(sun4v_chip_type), %g2
+	or	%g2, %lo(sun4v_chip_type), %g2
+	stw	%g4, [%g2]
+
+80:
 	BRANCH_IF_SUN4V(g1, jump_to_sun4u_init)
 	BRANCH_IF_CHEETAH_BASE(g1,g7,cheetah_boot)
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,cheetah_plus_boot)
@@ -414,13 +496,32 @@ niagara_tlb_fixup:
 	stw	%g2, [%g1 + %lo(tlb_type)]
 
 	/* Patch copy/clear ops.  */
+	sethi	%hi(sun4v_chip_type), %g1
+	lduw	[%g1 + %lo(sun4v_chip_type)], %g1
+	cmp	%g1, SUN4V_CHIP_NIAGARA1
+	be,pt	%xcc, niagara_patch
+	 cmp	%g1, SUN4V_CHIP_NIAGARA2
+	be,pt	%xcc, niagara_patch
+	 nop
+
+	call	generic_patch_copyops
+	 nop
+	call	generic_patch_bzero
+	 nop
+	call	generic_patch_pageops
+	 nop
+
+	ba,a,pt	%xcc, 80f
+
+niagara_patch:
 	call	niagara_patch_copyops
 	 nop
 	call	niagara_patch_bzero
 	 nop
 	call	niagara_patch_pageops
 	 nop
 
+80:
 	/* Patch TLB/cache ops.  */
 	call	hypervisor_patch_cachetlbops
 	 nop