---

yaml --- r: 58412 b: refs/heads/master c: 71ba22f h: refs/heads/master v: v3
git-mirror · Jul 9, 2007 · 1d118ef · 1d118ef
1 parent a6a1b36
commit 1d118ef
Show file tree

Hide file tree

Showing 96 changed files with 5,520 additions and 4,100 deletions.
diff --git a/[refs] b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: f2ec8030085a27c4ba8e95a10a96f248efb34177
+refs/heads/master: 71ba22fa739029bb158144813b9e82c00326497c
diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	mga=		[HW,DRM]
 
-	migration_cost=
-			[KNL,SMP] debug: override scheduler migration costs
-			Format: <level-1-usecs>,<level-2-usecs>,...
-			This debugging option can be used to override the
-			default scheduler migration cost matrix. The numbers
-			are indexed by 'CPU domain distance'.
-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
-			box will set up an intra-core migration cost of
-			1 msec, an inter-core migration cost of 2 msecs,
-			and an inter-node migration cost of 3 msecs.
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
-	migration_debug=
-			[KNL,SMP] migration cost auto-detect verbosity
-			Format=<0|1|2>
-			If a system's migration matrix reported at bootup
-			seems erroneous then this option can be used to
-			increase verbosity of the detection process.
-			We default to 0 (no extra messages), 1 will print
-			some more information, and 2 will be really
-			verbose (probably only useful if you also have a
-			serial console attached to the system).
-
-	migration_factor=
-			[KNL,SMP] multiply/divide migration costs by a factor
-			Format=<percent>
-			This debug option can be used to proportionally
-			increase or decrease the auto-detected migration
-			costs for all entries of the migration matrix.
-			E.g. migration_factor=150 will increase migration
-			costs by 50%. (and thus the scheduler will be less
-			eager migrating cache-hot tasks)
-			migration_factor=80 will decrease migration costs
-			by 20%. (thus the scheduler will be more eager to
-			migrate tasks)
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered

diff --git a/trunk/Documentation/sched-design-CFS.txt b/trunk/Documentation/sched-design-CFS.txt
@@ -0,0 +1,119 @@
+
+This is the CFS scheduler.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically
+models an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent  :-))  CPU that has 100%
+physical power and which can run each task at precise equal speed, in
+parallel, each at 1/nr_running speed. For example: if there are 2 tasks
+running then it runs each at 50% physical power - totally in parallel.
+
+On real hardware, we can run only a single task at once, so while that
+one task runs, the other tasks that are waiting for the CPU are at a
+disadvantage - the current task gets an unfair amount of CPU time. In
+CFS this fairness imbalance is expressed and tracked via the per-task
+p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
+time the task should now run on the CPU for it to become completely fair
+and balanced.
+
+( small detail: on 'ideal' hardware, the p->wait_runtime value would
+  always be zero - no task would ever get 'out of balance' from the
+  'ideal' share of CPU time. )
+
+CFS's task picking logic is based on this p->wait_runtime value and it
+is thus very simple: it always tries to run the task with the largest
+p->wait_runtime value. In other words, CFS tries to run the task with
+the 'gravest need' for more CPU time. So CFS always tries to split up
+CPU time between runnable tasks as close to 'ideal multitasking
+hardware' as possible.
+
+Most of the rest of CFS's design just falls out of this really simple
+concept, with a few add-on embellishments like nice levels,
+multiprocessing and various algorithm variants to recognize sleepers.
+
+In practice it works like this: the system runs a task a bit, and when
+the task schedules (or a scheduler tick happens) the task's CPU usage is
+'accounted for': the (small) time it just spent using the physical CPU
+is deducted from p->wait_runtime. [minus the 'fair share' it would have
+gotten anyway]. Once p->wait_runtime gets low enough so that another
+task becomes the 'leftmost task' of the time-ordered rbtree it maintains
+(plus a small amount of 'granularity' distance relative to the leftmost
+task so that we do not over-schedule tasks and trash the cache) then the
+new leftmost task is picked and the current task is preempted.
+
+The rq->fair_clock value tracks the 'CPU time a runnable task would have
+fairly gotten, had it been runnable during that time'. So by using
+rq->fair_clock values we can accurately timestamp and measure the
+'expected CPU time' a task should have gotten. All runnable tasks are
+sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
+CFS picks the 'leftmost' task and sticks to it. As the system progresses
+forwards, newly woken tasks are put into the tree more and more to the
+right - slowly but surely giving a chance for every task to become the
+'leftmost task' and thus get on the CPU within a deterministic amount of
+time.
+
+Some implementation details:
+
+ - the introduction of Scheduling Classes: an extensible hierarchy of
+   scheduler modules. These modules encapsulate scheduling policy
+   details and are handled by the scheduler core without the core
+   code assuming about them too much.
+
+ - sched_fair.c implements the 'CFS desktop scheduler': it is a
+   replacement for the vanilla scheduler's SCHED_OTHER interactivity
+   code.
+
+   I'd like to give credit to Con Kolivas for the general approach here:
+   he has proven via RSDL/SD that 'fair scheduling' is possible and that
+   it results in better desktop scheduling. Kudos Con!
+
+   The CFS patch uses a completely different approach and implementation
+   from RSDL/SD. My goal was to make CFS's interactivity quality exceed
+   that of RSDL/SD, which is a high standard to meet :-) Testing
+   feedback is welcome to decide this one way or another. [ and, in any
+   case, all of SD's logic could be added via a kernel/sched_sd.c module
+   as well, if Con is interested in such an approach. ]
+
+   CFS's design is quite radical: it does not use runqueues, it uses a
+   time-ordered rbtree to build a 'timeline' of future task execution,
+   and thus has no 'array switch' artifacts (by which both the vanilla
+   scheduler and RSDL/SD are affected).
+
+   CFS uses nanosecond granularity accounting and does not rely on any
+   jiffies or other HZ detail. Thus the CFS scheduler has no notion of
+   'timeslices' and has no heuristics whatsoever. There is only one
+   central tunable:
+
+         /proc/sys/kernel/sched_granularity_ns
+
+   which can be used to tune the scheduler from 'desktop' (low
+   latencies) to 'server' (good batching) workloads. It defaults to a
+   setting suitable for desktop workloads. SCHED_BATCH is handled by the
+   CFS scheduler module too.
+
+   Due to its design, the CFS scheduler is not prone to any of the
+   'attacks' that exist today against the heuristics of the stock
+   scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
+   work fine and do not impact interactivity and produce the expected
+   behavior.
+
+   the CFS scheduler has a much stronger handling of nice levels and
+   SCHED_BATCH: both types of workloads should be isolated much more
+   agressively than under the vanilla scheduler.
+
+   ( another detail: due to nanosec accounting and timeline sorting,
+     sched_yield() support is very simple under CFS, and in fact under
+     CFS sched_yield() behaves much better than under any other
+     scheduler i have tested so far. )
+
+ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
+   way than the vanilla scheduler does. It uses 100 runqueues (for all
+   100 RT priority levels, instead of 140 in the vanilla scheduler)
+   and it needs no expired array.
+
+ - reworked/sanitized SMP load-balancing: the runqueue-walking
+   assumptions are gone from the load-balancing code now, and
+   iterators of the scheduling modules are used. The balancing code got
+   quite a bit simpler as a result.
+
diff --git a/trunk/arch/i386/kernel/smpboot.c b/trunk/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 }
 #endif
 
-static void smp_tune_scheduling(void)
-{
-	if (cpu_khz) {
-		/* cache size in kB */
-		long cachesize = boot_cpu_data.x86_cache_size;
-
-		if (cachesize > 0)
-			max_cache_size = cachesize * 1024;
-	}
-}
-
 /*
  * Cycle through the processors sending APIC IPIs to boot each.
  */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
 
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 
 	set_cpu_sibling_map(0);
 

diff --git a/trunk/arch/i386/kernel/tsc.c b/trunk/arch/i386/kernel/tsc.c
@@ -4,6 +4,7 @@
  * See comments there for proper credits.
  */
 
+#include <linux/sched.h>
 #include <linux/clocksource.h>
 #include <linux/workqueue.h>
 #include <linux/cpufreq.h>
@@ -106,8 +107,13 @@ unsigned long long sched_clock(void)
 
 	/*
 	 * Fall back to jiffies if there's no TSC available:
+	 * ( But note that we still use it if the TSC is marked
+	 *   unstable. We do this because unlike Time Of Day,
+	 *   the scheduler clock tolerates small errors and it's
+	 *   very important for it to be as fast as the platform
+	 *   can achive it. )
 	 */
-	if (unlikely(!tsc_enabled))
+	if (unlikely(!tsc_enabled && !tsc_unstable))
 		/* No locking but a rare wrong value is not a big deal: */
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
@@ -277,6 +283,7 @@ static struct clocksource clocksource_tsc = {
 
 void mark_tsc_unstable(char *reason)
 {
+	sched_clock_unstable_event();
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;

diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
 get_max_cacheline_size (void)
 {
 	unsigned long line_size, max = 1;
-	unsigned int cache_size = 0;
 	u64 l, levels, unique_caches;
         pal_cache_config_info_t cci;
         s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
 		line_size = 1 << cci.pcci_line_size;
 		if (line_size > max)
 			max = line_size;
-		if (cache_size < cci.pcci_cache_size)
-			cache_size = cci.pcci_cache_size;
 		if (!cci.pcci_unified) {
 			status = ia64_pal_cache_config_info(l,
 						    /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
 			ia64_i_cache_stride_shift = cci.pcci_stride;
 	}
   out:
-#ifdef CONFIG_SMP
-	max_cache_size = max(max_cache_size, cache_size);
-#endif
 	if (max > ia64_max_cacheline_size)
 		ia64_max_cacheline_size = max;
 }

diff --git a/trunk/arch/mips/kernel/smp.c b/trunk/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS];		/* Map logical to physical */
 EXPORT_SYMBOL(phys_cpu_present_map);
 EXPORT_SYMBOL(cpu_online_map);
 
-/* This happens early in bootup, can't really do it better */
-static void smp_tune_scheduling (void)
-{
-	struct cache_desc *cd = &current_cpu_data.scache;
-	unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
-
-	if (cachesize > max_cache_size)
-		max_cache_size = cachesize;
-}
-
 extern void __init calibrate_delay(void);
 extern ATTRIB_NORET void cpu_idle(void);
 
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	init_new_context(current, &init_mm);
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 	plat_prepare_cpus(max_cpus);
 #ifndef CONFIG_HOTPLUG_CPU
 	cpu_present_map = cpu_possible_map;

diff --git a/trunk/arch/sparc/kernel/smp.c b/trunk/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	cpu_data(id).prom_node = cpu_node;
 	cpu_data(id).mid = cpu_get_hwmid(cpu_node);
 
-	/* this is required to tune the scheduler correctly */
-	/* is it possible to have CPUs with different cache sizes? */
-	if (id == boot_cpu_id) {
-		int cache_line,cache_nlines;
-		cache_line = 0x20;
-		cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
-		cache_nlines = 0x8000;
-		cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
-		max_cache_size = cache_line * cache_nlines;
-	}
 	if (cpu_data(id).mid < 0)
 		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
 }

diff --git a/trunk/arch/sparc64/kernel/smp.c b/trunk/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }
 
-static void __init smp_tune_scheduling(void)
-{
-	unsigned int smallest = ~0U;
-	int i;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		unsigned int val = cpu_data(i).ecache_size;
-
-		if (val && val < smallest)
-			smallest = val;
-	}
-
-	/* Any value less than 256K is nonsense.  */
-	if (smallest < (256U * 1024U))
-		smallest = 256 * 1024;
-
-	max_cache_size = smallest;
-
-	if (smallest < 1U * 1024U * 1024U)
-		printk(KERN_INFO "Using max_cache_size of %uKB\n",
-		       smallest / 1024U);
-	else
-		printk(KERN_INFO "Using max_cache_size of %uMB\n",
-		       smallest / 1024U / 1024U);
-}
-
 /* Constrain the number of cpus to max_cpus.  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	}
 
 	cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
-	smp_tune_scheduling();
 }
 
 void __devinit smp_prepare_boot_cpu(void)

diff --git a/trunk/drivers/ide/arm/icside.c b/trunk/drivers/ide/arm/icside.c
@@ -448,23 +448,21 @@ static int icside_dma_test_irq(ide_drive_t *drive)
 			ICS_ARCIN_V6_INTRSTAT_1)) & 1;
 }
 
-static int icside_dma_timeout(ide_drive_t *drive)
+static void icside_dma_timeout(ide_drive_t *drive)
 {
 	printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name);
 
 	if (icside_dma_test_irq(drive))
-		return 0;
+		return;
 
-	ide_dump_status(drive, "DMA timeout",
-		HWIF(drive)->INB(IDE_STATUS_REG));
+	ide_dump_status(drive, "DMA timeout", HWIF(drive)->INB(IDE_STATUS_REG));
 
-	return icside_dma_end(drive);
+	icside_dma_end(drive);
 }
 
-static int icside_dma_lostirq(ide_drive_t *drive)
+static void icside_dma_lost_irq(ide_drive_t *drive)
 {
 	printk(KERN_ERR "%s: IRQ lost\n", drive->name);
-	return 1;
 }
 
 static void icside_dma_init(ide_hwif_t *hwif)
@@ -490,8 +488,8 @@ static void icside_dma_init(ide_hwif_t *hwif)
 	hwif->dma_start		= icside_dma_start;
 	hwif->ide_dma_end	= icside_dma_end;
 	hwif->ide_dma_test_irq	= icside_dma_test_irq;
-	hwif->ide_dma_timeout	= icside_dma_timeout;
-	hwif->ide_dma_lostirq	= icside_dma_lostirq;
+	hwif->dma_timeout	= icside_dma_timeout;
+	hwif->dma_lost_irq	= icside_dma_lost_irq;
 
 	hwif->drives[0].autodma = hwif->autodma;
 	hwif->drives[1].autodma = hwif->autodma;

diff --git a/trunk/drivers/ide/cris/ide-cris.c b/trunk/drivers/ide/cris/ide-cris.c
@@ -819,7 +819,7 @@ init_e100_ide (void)
 		hwif->dma_host_off = &cris_dma_off;
 		hwif->dma_host_on = &cris_dma_on;
 		hwif->dma_off_quietly = &cris_dma_off;
-		hwif->udma_four = 0;
+		hwif->cbl = ATA_CBL_PATA40;
 		hwif->ultra_mask = cris_ultra_mask;
 		hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */
 		hwif->autodma = 1;