From 0c80d63b83db4c0b8d680cd7e33f1d8f86dc5b7d Mon Sep 17 00:00:00 2001 From: Ivo van Doorn Date: Fri, 11 May 2007 15:59:40 -0400 Subject: [PATCH] --- yaml --- r: 58337 b: refs/heads/master c: 9467d64b0e88763914c01f71ddf591b166c4f526 h: refs/heads/master i: 58335: f5a0524658e9761d2119de07e91198af66b75ef1 v: v3 --- [refs] | 2 +- trunk/Documentation/kernel-parameters.txt | 43 + trunk/Documentation/sched-design-CFS.txt | 119 - trunk/arch/i386/kernel/smpboot.c | 12 + trunk/arch/i386/kernel/tsc.c | 9 +- trunk/arch/ia64/kernel/setup.c | 6 + trunk/arch/mips/kernel/smp.c | 11 + trunk/arch/sparc/kernel/smp.c | 10 + trunk/arch/sparc64/kernel/smp.c | 27 + trunk/drivers/ide/arm/icside.c | 16 +- trunk/drivers/ide/cris/ide-cris.c | 2 +- trunk/drivers/ide/ide-cd.c | 6 +- trunk/drivers/ide/ide-cd.h | 2 + trunk/drivers/ide/ide-disk.c | 8 +- trunk/drivers/ide/ide-dma.c | 110 +- trunk/drivers/ide/ide-io.c | 4 +- trunk/drivers/ide/ide-iops.c | 8 +- trunk/drivers/ide/ide-probe.c | 10 +- trunk/drivers/ide/ide-proc.c | 34 +- trunk/drivers/ide/ide-timing.h | 56 +- trunk/drivers/ide/ide.c | 33 +- trunk/drivers/ide/legacy/hd.c | 2 +- trunk/drivers/ide/legacy/macide.c | 14 + trunk/drivers/ide/mips/au1xxx-ide.c | 24 +- trunk/drivers/ide/pci/aec62xx.c | 119 +- trunk/drivers/ide/pci/alim15x3.c | 78 +- trunk/drivers/ide/pci/amd74xx.c | 127 +- trunk/drivers/ide/pci/atiixp.c | 5 +- trunk/drivers/ide/pci/cmd64x.c | 130 +- trunk/drivers/ide/pci/cs5535.c | 6 +- trunk/drivers/ide/pci/hpt366.c | 170 +- trunk/drivers/ide/pci/it8213.c | 8 +- trunk/drivers/ide/pci/it821x.c | 9 +- trunk/drivers/ide/pci/jmicron.c | 20 +- trunk/drivers/ide/pci/pdc202xx_new.c | 9 +- trunk/drivers/ide/pci/pdc202xx_old.c | 35 +- trunk/drivers/ide/pci/piix.c | 45 +- trunk/drivers/ide/pci/scc_pata.c | 2 +- trunk/drivers/ide/pci/serverworks.c | 103 +- trunk/drivers/ide/pci/sgiioc4.c | 20 +- trunk/drivers/ide/pci/siimage.c | 18 +- trunk/drivers/ide/pci/sis5513.c | 34 +- trunk/drivers/ide/pci/sl82c105.c | 20 +- trunk/drivers/ide/pci/slc90e66.c | 5 +- trunk/drivers/ide/pci/tc86c001.c | 4 +- trunk/drivers/ide/pci/via82cxxx.c | 175 +- trunk/drivers/ide/ppc/pmac.c | 42 +- trunk/drivers/misc/Kconfig | 6 +- trunk/drivers/misc/Makefile | 1 + trunk/drivers/misc/eeprom_93cx6.c | 229 ++ trunk/fs/jfs/endian24.h | 2 +- trunk/fs/jfs/jfs_debug.c | 28 + trunk/fs/jfs/jfs_debug.h | 2 + trunk/fs/jfs/jfs_dinode.h | 42 +- trunk/fs/jfs/jfs_dmap.c | 419 +-- trunk/fs/jfs/jfs_dmap.h | 118 +- trunk/fs/jfs/jfs_dtree.c | 105 +- trunk/fs/jfs/jfs_dtree.h | 2 +- trunk/fs/jfs/jfs_extent.c | 102 +- trunk/fs/jfs/jfs_filsys.h | 13 +- trunk/fs/jfs/jfs_imap.c | 296 +- trunk/fs/jfs/jfs_imap.h | 98 +- trunk/fs/jfs/jfs_incore.h | 4 +- trunk/fs/jfs/jfs_logmgr.c | 90 +- trunk/fs/jfs/jfs_logmgr.h | 26 +- trunk/fs/jfs/jfs_metapage.c | 3 +- trunk/fs/jfs/jfs_mount.c | 6 +- trunk/fs/jfs/jfs_txnmgr.c | 302 +- trunk/fs/jfs/jfs_txnmgr.h | 2 +- trunk/fs/jfs/jfs_types.h | 20 +- trunk/fs/jfs/jfs_umount.c | 2 +- trunk/fs/jfs/jfs_xtree.c | 428 +-- trunk/fs/jfs/jfs_xtree.h | 48 +- trunk/fs/jfs/namei.c | 26 +- trunk/fs/jfs/resize.c | 48 +- trunk/fs/jfs/xattr.c | 9 +- trunk/fs/proc/array.c | 59 +- trunk/fs/proc/base.c | 71 +- trunk/include/asm-generic/bitops/sched.h | 21 +- .../include/asm-mips/mach-au1x00/au1xxx_ide.h | 28 +- trunk/include/linux/eeprom_93cx6.h | 72 + trunk/include/linux/hardirq.h | 13 - trunk/include/linux/ide.h | 18 +- trunk/include/linux/sched.h | 251 +- trunk/include/linux/topology.h | 12 +- trunk/include/linux/wait.h | 16 +- trunk/init/main.c | 5 +- trunk/kernel/delayacct.c | 10 +- 
trunk/kernel/exit.c | 5 +- trunk/kernel/fork.c | 4 +- trunk/kernel/posix-cpu-timers.c | 34 +- trunk/kernel/sched.c | 3023 ++++++++++------- trunk/kernel/sched_debug.c | 275 -- trunk/kernel/sched_fair.c | 1131 ------ trunk/kernel/sched_idletask.c | 71 - trunk/kernel/sched_rt.c | 255 -- trunk/kernel/sched_stats.h | 235 -- trunk/kernel/softirq.c | 1 + trunk/kernel/sysctl.c | 80 - trunk/lib/Kconfig.debug | 9 - 100 files changed, 4407 insertions(+), 5521 deletions(-) delete mode 100644 trunk/Documentation/sched-design-CFS.txt create mode 100644 trunk/drivers/misc/eeprom_93cx6.c create mode 100644 trunk/include/linux/eeprom_93cx6.h delete mode 100644 trunk/kernel/sched_debug.c delete mode 100644 trunk/kernel/sched_fair.c delete mode 100644 trunk/kernel/sched_idletask.c delete mode 100644 trunk/kernel/sched_rt.c delete mode 100644 trunk/kernel/sched_stats.h diff --git a/[refs] b/[refs] index cd4f9270cd1a..6d1239337479 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 27a278aa4309df244a2619f47031acce00ca1b7c +refs/heads/master: 9467d64b0e88763914c01f71ddf591b166c4f526 diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index 4d880b3d1f35..af50f9bbe68e 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -1014,6 +1014,49 @@ and is between 256 and 4096 characters. It is defined in the file mga= [HW,DRM] + migration_cost= + [KNL,SMP] debug: override scheduler migration costs + Format: ,,... + This debugging option can be used to override the + default scheduler migration cost matrix. The numbers + are indexed by 'CPU domain distance'. + E.g. migration_cost=1000,2000,3000 on an SMT NUMA + box will set up an intra-core migration cost of + 1 msec, an inter-core migration cost of 2 msecs, + and an inter-node migration cost of 3 msecs. + + WARNING: using the wrong values here can break + scheduler performance, so it's only for scheduler + development purposes, not production environments. + + migration_debug= + [KNL,SMP] migration cost auto-detect verbosity + Format=<0|1|2> + If a system's migration matrix reported at bootup + seems erroneous then this option can be used to + increase verbosity of the detection process. + We default to 0 (no extra messages), 1 will print + some more information, and 2 will be really + verbose (probably only useful if you also have a + serial console attached to the system). + + migration_factor= + [KNL,SMP] multiply/divide migration costs by a factor + Format= + This debug option can be used to proportionally + increase or decrease the auto-detected migration + costs for all entries of the migration matrix. + E.g. migration_factor=150 will increase migration + costs by 50%. (and thus the scheduler will be less + eager migrating cache-hot tasks) + migration_factor=80 will decrease migration costs + by 20%. (thus the scheduler will be more eager to + migrate tasks) + + WARNING: using the wrong values here can break + scheduler performance, so it's only for scheduler + development purposes, not production environments. + mousedev.tap_time= [MOUSE] Maximum time between finger touching and leaving touchpad surface for touch to be considered diff --git a/trunk/Documentation/sched-design-CFS.txt b/trunk/Documentation/sched-design-CFS.txt deleted file mode 100644 index 16feebb7bdc0..000000000000 --- a/trunk/Documentation/sched-design-CFS.txt +++ /dev/null @@ -1,119 +0,0 @@ - -This is the CFS scheduler. 
- -80% of CFS's design can be summed up in a single sentence: CFS basically -models an "ideal, precise multi-tasking CPU" on real hardware. - -"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% -physical power and which can run each task at precise equal speed, in -parallel, each at 1/nr_running speed. For example: if there are 2 tasks -running then it runs each at 50% physical power - totally in parallel. - -On real hardware, we can run only a single task at once, so while that -one task runs, the other tasks that are waiting for the CPU are at a -disadvantage - the current task gets an unfair amount of CPU time. In -CFS this fairness imbalance is expressed and tracked via the per-task -p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of -time the task should now run on the CPU for it to become completely fair -and balanced. - -( small detail: on 'ideal' hardware, the p->wait_runtime value would - always be zero - no task would ever get 'out of balance' from the - 'ideal' share of CPU time. ) - -CFS's task picking logic is based on this p->wait_runtime value and it -is thus very simple: it always tries to run the task with the largest -p->wait_runtime value. In other words, CFS tries to run the task with -the 'gravest need' for more CPU time. So CFS always tries to split up -CPU time between runnable tasks as close to 'ideal multitasking -hardware' as possible. - -Most of the rest of CFS's design just falls out of this really simple -concept, with a few add-on embellishments like nice levels, -multiprocessing and various algorithm variants to recognize sleepers. - -In practice it works like this: the system runs a task a bit, and when -the task schedules (or a scheduler tick happens) the task's CPU usage is -'accounted for': the (small) time it just spent using the physical CPU -is deducted from p->wait_runtime. [minus the 'fair share' it would have -gotten anyway]. Once p->wait_runtime gets low enough so that another -task becomes the 'leftmost task' of the time-ordered rbtree it maintains -(plus a small amount of 'granularity' distance relative to the leftmost -task so that we do not over-schedule tasks and trash the cache) then the -new leftmost task is picked and the current task is preempted. - -The rq->fair_clock value tracks the 'CPU time a runnable task would have -fairly gotten, had it been runnable during that time'. So by using -rq->fair_clock values we can accurately timestamp and measure the -'expected CPU time' a task should have gotten. All runnable tasks are -sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and -CFS picks the 'leftmost' task and sticks to it. As the system progresses -forwards, newly woken tasks are put into the tree more and more to the -right - slowly but surely giving a chance for every task to become the -'leftmost task' and thus get on the CPU within a deterministic amount of -time. - -Some implementation details: - - - the introduction of Scheduling Classes: an extensible hierarchy of - scheduler modules. These modules encapsulate scheduling policy - details and are handled by the scheduler core without the core - code assuming about them too much. - - - sched_fair.c implements the 'CFS desktop scheduler': it is a - replacement for the vanilla scheduler's SCHED_OTHER interactivity - code. - - I'd like to give credit to Con Kolivas for the general approach here: - he has proven via RSDL/SD that 'fair scheduling' is possible and that - it results in better desktop scheduling. Kudos Con! 
- - The CFS patch uses a completely different approach and implementation - from RSDL/SD. My goal was to make CFS's interactivity quality exceed - that of RSDL/SD, which is a high standard to meet :-) Testing - feedback is welcome to decide this one way or another. [ and, in any - case, all of SD's logic could be added via a kernel/sched_sd.c module - as well, if Con is interested in such an approach. ] - - CFS's design is quite radical: it does not use runqueues, it uses a - time-ordered rbtree to build a 'timeline' of future task execution, - and thus has no 'array switch' artifacts (by which both the vanilla - scheduler and RSDL/SD are affected). - - CFS uses nanosecond granularity accounting and does not rely on any - jiffies or other HZ detail. Thus the CFS scheduler has no notion of - 'timeslices' and has no heuristics whatsoever. There is only one - central tunable: - - /proc/sys/kernel/sched_granularity_ns - - which can be used to tune the scheduler from 'desktop' (low - latencies) to 'server' (good batching) workloads. It defaults to a - setting suitable for desktop workloads. SCHED_BATCH is handled by the - CFS scheduler module too. - - Due to its design, the CFS scheduler is not prone to any of the - 'attacks' that exist today against the heuristics of the stock - scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all - work fine and do not impact interactivity and produce the expected - behavior. - - the CFS scheduler has a much stronger handling of nice levels and - SCHED_BATCH: both types of workloads should be isolated much more - agressively than under the vanilla scheduler. - - ( another detail: due to nanosec accounting and timeline sorting, - sched_yield() support is very simple under CFS, and in fact under - CFS sched_yield() behaves much better than under any other - scheduler i have tested so far. ) - - - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler - way than the vanilla scheduler does. It uses 100 runqueues (for all - 100 RT priority levels, instead of 140 in the vanilla scheduler) - and it needs no expired array. - - - reworked/sanitized SMP load-balancing: the runqueue-walking - assumptions are gone from the load-balancing code now, and - iterators of the scheduling modules are used. The balancing code got - quite a bit simpler as a result. - diff --git a/trunk/arch/i386/kernel/smpboot.c b/trunk/arch/i386/kernel/smpboot.c index 0b2954534b8e..88baed1e7e83 100644 --- a/trunk/arch/i386/kernel/smpboot.c +++ b/trunk/arch/i386/kernel/smpboot.c @@ -941,6 +941,17 @@ static int __cpuinit __smp_prepare_cpu(int cpu) } #endif +static void smp_tune_scheduling(void) +{ + if (cpu_khz) { + /* cache size in kB */ + long cachesize = boot_cpu_data.x86_cache_size; + + if (cachesize > 0) + max_cache_size = cachesize * 1024; + } +} + /* * Cycle through the processors sending APIC IPIs to boot each. */ @@ -969,6 +980,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; current_thread_info()->cpu = 0; + smp_tune_scheduling(); set_cpu_sibling_map(0); diff --git a/trunk/arch/i386/kernel/tsc.c b/trunk/arch/i386/kernel/tsc.c index ea63a30ca3e8..f64b81f3033b 100644 --- a/trunk/arch/i386/kernel/tsc.c +++ b/trunk/arch/i386/kernel/tsc.c @@ -4,7 +4,6 @@ * See comments there for proper credits. 
*/ -#include #include #include #include @@ -107,13 +106,8 @@ unsigned long long sched_clock(void) /* * Fall back to jiffies if there's no TSC available: - * ( But note that we still use it if the TSC is marked - * unstable. We do this because unlike Time Of Day, - * the scheduler clock tolerates small errors and it's - * very important for it to be as fast as the platform - * can achive it. ) */ - if (unlikely(!tsc_enabled && !tsc_unstable)) + if (unlikely(!tsc_enabled)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); @@ -283,7 +277,6 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - sched_clock_unstable_event(); if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c index 188fb73c6845..eaa6a24bc0b6 100644 --- a/trunk/arch/ia64/kernel/setup.c +++ b/trunk/arch/ia64/kernel/setup.c @@ -805,6 +805,7 @@ static void __cpuinit get_max_cacheline_size (void) { unsigned long line_size, max = 1; + unsigned int cache_size = 0; u64 l, levels, unique_caches; pal_cache_config_info_t cci; s64 status; @@ -834,6 +835,8 @@ get_max_cacheline_size (void) line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; + if (cache_size < cci.pcci_cache_size) + cache_size = cci.pcci_cache_size; if (!cci.pcci_unified) { status = ia64_pal_cache_config_info(l, /* cache_type (instruction)= */ 1, @@ -850,6 +853,9 @@ get_max_cacheline_size (void) ia64_i_cache_stride_shift = cci.pcci_stride; } out: +#ifdef CONFIG_SMP + max_cache_size = max(max_cache_size, cache_size); +#endif if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; } diff --git a/trunk/arch/mips/kernel/smp.c b/trunk/arch/mips/kernel/smp.c index a1b017f2dbb3..67edfa7ed93a 100644 --- a/trunk/arch/mips/kernel/smp.c +++ b/trunk/arch/mips/kernel/smp.c @@ -51,6 +51,16 @@ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */ EXPORT_SYMBOL(phys_cpu_present_map); EXPORT_SYMBOL(cpu_online_map); +/* This happens early in bootup, can't really do it better */ +static void smp_tune_scheduling (void) +{ + struct cache_desc *cd = ¤t_cpu_data.scache; + unsigned long cachesize = cd->linesz * cd->sets * cd->ways; + + if (cachesize > max_cache_size) + max_cache_size = cachesize; +} + extern void __init calibrate_delay(void); extern ATTRIB_NORET void cpu_idle(void); @@ -218,6 +228,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) { init_new_context(current, &init_mm); current_thread_info()->cpu = 0; + smp_tune_scheduling(); plat_prepare_cpus(max_cpus); #ifndef CONFIG_HOTPLUG_CPU cpu_present_map = cpu_possible_map; diff --git a/trunk/arch/sparc/kernel/smp.c b/trunk/arch/sparc/kernel/smp.c index 4fea3ac7bff0..4d9ad59031bb 100644 --- a/trunk/arch/sparc/kernel/smp.c +++ b/trunk/arch/sparc/kernel/smp.c @@ -68,6 +68,16 @@ void __cpuinit smp_store_cpu_info(int id) cpu_data(id).prom_node = cpu_node; cpu_data(id).mid = cpu_get_hwmid(cpu_node); + /* this is required to tune the scheduler correctly */ + /* is it possible to have CPUs with different cache sizes? 
*/ + if (id == boot_cpu_id) { + int cache_line,cache_nlines; + cache_line = 0x20; + cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line); + cache_nlines = 0x8000; + cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines); + max_cache_size = cache_line * cache_nlines; + } if (cpu_data(id).mid < 0) panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); } diff --git a/trunk/arch/sparc64/kernel/smp.c b/trunk/arch/sparc64/kernel/smp.c index 40e40f968d61..4dcd7d0b60f2 100644 --- a/trunk/arch/sparc64/kernel/smp.c +++ b/trunk/arch/sparc64/kernel/smp.c @@ -1163,6 +1163,32 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } +static void __init smp_tune_scheduling(void) +{ + unsigned int smallest = ~0U; + int i; + + for (i = 0; i < NR_CPUS; i++) { + unsigned int val = cpu_data(i).ecache_size; + + if (val && val < smallest) + smallest = val; + } + + /* Any value less than 256K is nonsense. */ + if (smallest < (256U * 1024U)) + smallest = 256 * 1024; + + max_cache_size = smallest; + + if (smallest < 1U * 1024U * 1024U) + printk(KERN_INFO "Using max_cache_size of %uKB\n", + smallest / 1024U); + else + printk(KERN_INFO "Using max_cache_size of %uMB\n", + smallest / 1024U / 1024U); +} + /* Constrain the number of cpus to max_cpus. */ void __init smp_prepare_cpus(unsigned int max_cpus) { @@ -1180,6 +1206,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; + smp_tune_scheduling(); } void __devinit smp_prepare_boot_cpu(void) diff --git a/trunk/drivers/ide/arm/icside.c b/trunk/drivers/ide/arm/icside.c index 444a0b84f5bd..66f826252aee 100644 --- a/trunk/drivers/ide/arm/icside.c +++ b/trunk/drivers/ide/arm/icside.c @@ -448,21 +448,23 @@ static int icside_dma_test_irq(ide_drive_t *drive) ICS_ARCIN_V6_INTRSTAT_1)) & 1; } -static void icside_dma_timeout(ide_drive_t *drive) +static int icside_dma_timeout(ide_drive_t *drive) { printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); if (icside_dma_test_irq(drive)) - return; + return 0; - ide_dump_status(drive, "DMA timeout", HWIF(drive)->INB(IDE_STATUS_REG)); + ide_dump_status(drive, "DMA timeout", + HWIF(drive)->INB(IDE_STATUS_REG)); - icside_dma_end(drive); + return icside_dma_end(drive); } -static void icside_dma_lost_irq(ide_drive_t *drive) +static int icside_dma_lostirq(ide_drive_t *drive) { printk(KERN_ERR "%s: IRQ lost\n", drive->name); + return 1; } static void icside_dma_init(ide_hwif_t *hwif) @@ -488,8 +490,8 @@ static void icside_dma_init(ide_hwif_t *hwif) hwif->dma_start = icside_dma_start; hwif->ide_dma_end = icside_dma_end; hwif->ide_dma_test_irq = icside_dma_test_irq; - hwif->dma_timeout = icside_dma_timeout; - hwif->dma_lost_irq = icside_dma_lost_irq; + hwif->ide_dma_timeout = icside_dma_timeout; + hwif->ide_dma_lostirq = icside_dma_lostirq; hwif->drives[0].autodma = hwif->autodma; hwif->drives[1].autodma = hwif->autodma; diff --git a/trunk/drivers/ide/cris/ide-cris.c b/trunk/drivers/ide/cris/ide-cris.c index 886091bc7db0..ca0341c05e55 100644 --- a/trunk/drivers/ide/cris/ide-cris.c +++ b/trunk/drivers/ide/cris/ide-cris.c @@ -819,7 +819,7 @@ init_e100_ide (void) hwif->dma_host_off = &cris_dma_off; hwif->dma_host_on = &cris_dma_on; hwif->dma_off_quietly = &cris_dma_off; - hwif->cbl = ATA_CBL_PATA40; + hwif->udma_four = 0; hwif->ultra_mask = cris_ultra_mask; hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */ hwif->autodma = 1; diff --git a/trunk/drivers/ide/ide-cd.c b/trunk/drivers/ide/ide-cd.c index 1486eb212ccc..252ab8295edf 
100644 --- a/trunk/drivers/ide/ide-cd.c +++ b/trunk/drivers/ide/ide-cd.c @@ -481,7 +481,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive, else printk(" Unknown Error Type: "); - if (sense->sense_key < ARRAY_SIZE(sense_key_texts)) + if (sense->sense_key < ARY_LEN(sense_key_texts)) s = sense_key_texts[sense->sense_key]; printk("%s -- (Sense key=0x%02x)\n", s, sense->sense_key); @@ -491,7 +491,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive, sense->ascq); s = buf; } else { - int lo = 0, mid, hi = ARRAY_SIZE(sense_data_texts); + int lo = 0, mid, hi = ARY_LEN(sense_data_texts); unsigned long key = (sense->sense_key << 16); key |= (sense->asc << 8); if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd)) @@ -524,7 +524,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive, if (failed_command != NULL) { - int lo=0, mid, hi= ARRAY_SIZE(packet_command_texts); + int lo=0, mid, hi= ARY_LEN (packet_command_texts); s = NULL; while (hi > lo) { diff --git a/trunk/drivers/ide/ide-cd.h b/trunk/drivers/ide/ide-cd.h index 228b29c5d2e4..ad1f2ed14a37 100644 --- a/trunk/drivers/ide/ide-cd.h +++ b/trunk/drivers/ide/ide-cd.h @@ -498,6 +498,8 @@ struct cdrom_info { * Descriptions of ATAPI error codes. */ +#define ARY_LEN(a) ((sizeof(a) / sizeof(a[0]))) + /* This stuff should be in cdrom.h, since it is now generic... */ /* ATAPI sense keys (from table 140 of ATAPI 2.6) */ diff --git a/trunk/drivers/ide/ide-disk.c b/trunk/drivers/ide/ide-disk.c index b1304a7f3e0a..dc2175c81f5e 100644 --- a/trunk/drivers/ide/ide-disk.c +++ b/trunk/drivers/ide/ide-disk.c @@ -1190,11 +1190,11 @@ static int idedisk_ioctl(struct inode *inode, struct file *file, return generic_ide_ioctl(drive, file, bdev, cmd, arg); read_val: - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); spin_lock_irqsave(&ide_lock, flags); err = *val; spin_unlock_irqrestore(&ide_lock, flags); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); return err >= 0 ? 
put_user(err, (long __user *)arg) : err; set_val: @@ -1204,9 +1204,9 @@ static int idedisk_ioctl(struct inode *inode, struct file *file, if (!capable(CAP_SYS_ADMIN)) err = -EACCES; else { - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); err = setfunc(drive, arg); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } } return err; diff --git a/trunk/drivers/ide/ide-dma.c b/trunk/drivers/ide/ide-dma.c index 5fe1d72ab451..ead141e2db9e 100644 --- a/trunk/drivers/ide/ide-dma.c +++ b/trunk/drivers/ide/ide-dma.c @@ -91,45 +91,45 @@ static const struct drive_list_entry drive_whitelist [] = { - { "Micropolis 2112A" , NULL }, - { "CONNER CTMA 4000" , NULL }, - { "CONNER CTT8000-A" , NULL }, - { "ST34342A" , NULL }, + { "Micropolis 2112A" , "ALL" }, + { "CONNER CTMA 4000" , "ALL" }, + { "CONNER CTT8000-A" , "ALL" }, + { "ST34342A" , "ALL" }, { NULL , NULL } }; static const struct drive_list_entry drive_blacklist [] = { - { "WDC AC11000H" , NULL }, - { "WDC AC22100H" , NULL }, - { "WDC AC32500H" , NULL }, - { "WDC AC33100H" , NULL }, - { "WDC AC31600H" , NULL }, + { "WDC AC11000H" , "ALL" }, + { "WDC AC22100H" , "ALL" }, + { "WDC AC32500H" , "ALL" }, + { "WDC AC33100H" , "ALL" }, + { "WDC AC31600H" , "ALL" }, { "WDC AC32100H" , "24.09P07" }, { "WDC AC23200L" , "21.10N21" }, - { "Compaq CRD-8241B" , NULL }, - { "CRD-8400B" , NULL }, - { "CRD-8480B", NULL }, - { "CRD-8482B", NULL }, - { "CRD-84" , NULL }, - { "SanDisk SDP3B" , NULL }, - { "SanDisk SDP3B-64" , NULL }, - { "SANYO CD-ROM CRD" , NULL }, - { "HITACHI CDR-8" , NULL }, - { "HITACHI CDR-8335" , NULL }, - { "HITACHI CDR-8435" , NULL }, - { "Toshiba CD-ROM XM-6202B" , NULL }, - { "TOSHIBA CD-ROM XM-1702BC", NULL }, - { "CD-532E-A" , NULL }, - { "E-IDE CD-ROM CR-840", NULL }, - { "CD-ROM Drive/F5A", NULL }, - { "WPI CDD-820", NULL }, - { "SAMSUNG CD-ROM SC-148C", NULL }, - { "SAMSUNG CD-ROM SC", NULL }, - { "ATAPI CD-ROM DRIVE 40X MAXIMUM", NULL }, - { "_NEC DV5800A", NULL }, + { "Compaq CRD-8241B" , "ALL" }, + { "CRD-8400B" , "ALL" }, + { "CRD-8480B", "ALL" }, + { "CRD-8482B", "ALL" }, + { "CRD-84" , "ALL" }, + { "SanDisk SDP3B" , "ALL" }, + { "SanDisk SDP3B-64" , "ALL" }, + { "SANYO CD-ROM CRD" , "ALL" }, + { "HITACHI CDR-8" , "ALL" }, + { "HITACHI CDR-8335" , "ALL" }, + { "HITACHI CDR-8435" , "ALL" }, + { "Toshiba CD-ROM XM-6202B" , "ALL" }, + { "TOSHIBA CD-ROM XM-1702BC", "ALL" }, + { "CD-532E-A" , "ALL" }, + { "E-IDE CD-ROM CR-840", "ALL" }, + { "CD-ROM Drive/F5A", "ALL" }, + { "WPI CDD-820", "ALL" }, + { "SAMSUNG CD-ROM SC-148C", "ALL" }, + { "SAMSUNG CD-ROM SC", "ALL" }, + { "ATAPI CD-ROM DRIVE 40X MAXIMUM", "ALL" }, + { "_NEC DV5800A", "ALL" }, { "SAMSUNG CD-ROM SN-124", "N001" }, - { "Seagate STT20000A", NULL }, + { "Seagate STT20000A", "ALL" }, { NULL , NULL } }; @@ -147,8 +147,8 @@ int ide_in_drive_list(struct hd_driveid *id, const struct drive_list_entry *driv { for ( ; drive_table->id_model ; drive_table++) if ((!strcmp(drive_table->id_model, id->model)) && - (!drive_table->id_firmware || - strstr(id->fw_rev, drive_table->id_firmware))) + ((strstr(id->fw_rev, drive_table->id_firmware)) || + (!strcmp(drive_table->id_firmware, "ALL")))) return 1; return 0; } @@ -702,22 +702,8 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base) mask = id->dma_mword & hwif->mwdma_mask; break; case XFER_SW_DMA_0: - if (id->field_valid & 2) { + if (id->field_valid & 2) mask = id->dma_1word & hwif->swdma_mask; - } else if (id->tDMA) { - /* - * ide_fix_driveid() doesn't convert ->tDMA to the - * CPU endianness so we need to do 
it here - */ - u8 mode = le16_to_cpu(id->tDMA); - - /* - * if the mode is valid convert it to the mask - * (the maximum allowed mode is XFER_SW_DMA_2) - */ - if (mode <= 2) - mask = ((2 << mode) - 1) & hwif->swdma_mask; - } break; default: BUG(); @@ -861,27 +847,27 @@ int ide_set_dma(ide_drive_t *drive) return rc; } +EXPORT_SYMBOL_GPL(ide_set_dma); + #ifdef CONFIG_BLK_DEV_IDEDMA_PCI -void ide_dma_lost_irq (ide_drive_t *drive) +int __ide_dma_lostirq (ide_drive_t *drive) { printk("%s: DMA interrupt recovery\n", drive->name); + return 1; } -EXPORT_SYMBOL(ide_dma_lost_irq); +EXPORT_SYMBOL(__ide_dma_lostirq); -void ide_dma_timeout (ide_drive_t *drive) +int __ide_dma_timeout (ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); - printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name); + if (HWIF(drive)->ide_dma_test_irq(drive)) + return 0; - if (hwif->ide_dma_test_irq(drive)) - return; - - hwif->ide_dma_end(drive); + return HWIF(drive)->ide_dma_end(drive); } -EXPORT_SYMBOL(ide_dma_timeout); +EXPORT_SYMBOL(__ide_dma_timeout); /* * Needed for allowing full modular support of ide-driver @@ -1032,10 +1018,10 @@ void ide_setup_dma (ide_hwif_t *hwif, unsigned long dma_base, unsigned int num_p hwif->ide_dma_end = &__ide_dma_end; if (!hwif->ide_dma_test_irq) hwif->ide_dma_test_irq = &__ide_dma_test_irq; - if (!hwif->dma_timeout) - hwif->dma_timeout = &ide_dma_timeout; - if (!hwif->dma_lost_irq) - hwif->dma_lost_irq = &ide_dma_lost_irq; + if (!hwif->ide_dma_timeout) + hwif->ide_dma_timeout = &__ide_dma_timeout; + if (!hwif->ide_dma_lostirq) + hwif->ide_dma_lostirq = &__ide_dma_lostirq; if (hwif->chipset != ide_trm290) { u8 dma_stat = hwif->INB(hwif->dma_status); diff --git a/trunk/drivers/ide/ide-io.c b/trunk/drivers/ide/ide-io.c index c5b5011da56e..bfe8f1b712ba 100644 --- a/trunk/drivers/ide/ide-io.c +++ b/trunk/drivers/ide/ide-io.c @@ -1350,7 +1350,7 @@ static ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) hwif->INB(IDE_STATUS_REG)); } else { printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); - hwif->dma_timeout(drive); + (void) hwif->ide_dma_timeout(drive); } /* @@ -1466,7 +1466,7 @@ void ide_timer_expiry (unsigned long data) startstop = handler(drive); } else if (drive_is_ready(drive)) { if (drive->waiting_for_dma) - hwgroup->hwif->dma_lost_irq(drive); + (void) hwgroup->hwif->ide_dma_lostirq(drive); (void)ide_ack_intr(hwif); printk(KERN_WARNING "%s: lost interrupt\n", drive->name); startstop = handler(drive); diff --git a/trunk/drivers/ide/ide-iops.c b/trunk/drivers/ide/ide-iops.c index 92578b6832e9..f0be5f665a0e 100644 --- a/trunk/drivers/ide/ide-iops.c +++ b/trunk/drivers/ide/ide-iops.c @@ -574,10 +574,7 @@ u8 eighty_ninty_three (ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; struct hd_driveid *id = drive->id; - if (hwif->cbl == ATA_CBL_PATA40_SHORT) - return 1; - - if (hwif->cbl != ATA_CBL_PATA80) + if (hwif->udma_four == 0) goto no_80w; /* Check for SATA but only if we are ATA5 or higher */ @@ -603,8 +600,7 @@ u8 eighty_ninty_three (ide_drive_t *drive) printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, " "limiting max speed to UDMA33\n", - drive->name, - hwif->cbl == ATA_CBL_PATA80 ? "drive" : "host"); + drive->name, hwif->udma_four ? 
"drive" : "host"); drive->udma33_warned = 1; diff --git a/trunk/drivers/ide/ide-probe.c b/trunk/drivers/ide/ide-probe.c index cc5801399467..f5ce22c38f82 100644 --- a/trunk/drivers/ide/ide-probe.c +++ b/trunk/drivers/ide/ide-probe.c @@ -144,7 +144,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd) local_irq_enable(); ide_fix_driveid(id); -#if defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) +#if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) /* * EATA SCSI controllers do a hardware ATA emulation: * Ignore them if there is a driver for them available. @@ -154,7 +154,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd) printk("%s: EATA SCSI HBA %.10s\n", drive->name, id->model); goto err_misc; } -#endif /* CONFIG_SCSI_EATA || CONFIG_SCSI_EATA_PIO */ +#endif /* CONFIG_SCSI_EATA_DMA || CONFIG_SCSI_EATA_PIO */ /* * WIN_IDENTIFY returns little-endian info, @@ -1025,7 +1025,7 @@ static int init_irq (ide_hwif_t *hwif) BUG_ON(irqs_disabled()); BUG_ON(hwif == NULL); - mutex_lock(&ide_cfg_mtx); + down(&ide_cfg_sem); hwif->hwgroup = NULL; #if MAX_HWIFS > 1 /* @@ -1154,7 +1154,7 @@ static int init_irq (ide_hwif_t *hwif) printk(" (%sed with %s)", hwif->sharing_irq ? "shar" : "serializ", match->name); printk("\n"); - mutex_unlock(&ide_cfg_mtx); + up(&ide_cfg_sem); return 0; out_unlink: spin_lock_irq(&ide_lock); @@ -1177,7 +1177,7 @@ static int init_irq (ide_hwif_t *hwif) } spin_unlock_irq(&ide_lock); out_up: - mutex_unlock(&ide_cfg_mtx); + up(&ide_cfg_sem); return 1; } diff --git a/trunk/drivers/ide/ide-proc.c b/trunk/drivers/ide/ide-proc.c index fc1d8ae6a803..ea94c9aa1220 100644 --- a/trunk/drivers/ide/ide-proc.c +++ b/trunk/drivers/ide/ide-proc.c @@ -156,7 +156,7 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d { ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL; - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); while ((*p) && strcmp((*p)->name, name) < 0) p = &((*p)->next); if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL) @@ -177,10 +177,10 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d if (auto_remove) setting->auto_remove = 1; *p = setting; - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); return 0; abort: - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); kfree(setting); return -1; } @@ -224,7 +224,7 @@ static void __ide_remove_setting (ide_drive_t *drive, char *name) * * Automatically remove all the driver specific settings for this * drive. This function may not be called from IRQ context. The - * caller must hold ide_setting_mtx. + * caller must hold ide_setting_sem. */ static void auto_remove_settings (ide_drive_t *drive) @@ -269,7 +269,7 @@ static ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name) * @setting: drive setting * * Read a drive setting and return the value. The caller - * must hold the ide_setting_mtx when making this call. + * must hold the ide_setting_sem when making this call. * * BUGS: the data return and error are the same return value * so an error -EINVAL and true return of the same value cannot @@ -306,7 +306,7 @@ static int ide_read_setting(ide_drive_t *drive, ide_settings_t *setting) * @val: value * * Write a drive setting if it is possible. The caller - * must hold the ide_setting_mtx when making this call. + * must hold the ide_setting_sem when making this call. 
* * BUGS: the data return and error are the same return value * so an error -EINVAL and true return of the same value cannot @@ -367,7 +367,7 @@ static int set_xfer_rate (ide_drive_t *drive, int arg) * @drive: drive being configured * * Add the generic parts of the system settings to the /proc files. - * The caller must not be holding the ide_setting_mtx. + * The caller must not be holding the ide_setting_sem. */ void ide_add_generic_settings (ide_drive_t *drive) @@ -408,7 +408,7 @@ static int proc_ide_read_settings proc_ide_settings_warn(); - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); while(setting) { @@ -428,7 +428,7 @@ static int proc_ide_read_settings setting = setting->next; } len = out - page; - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); PROC_IDE_READ_RETURN(page,start,off,count,eof,len); } @@ -508,16 +508,16 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer, ++p; } - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); setting = ide_find_setting_by_name(drive, name); if (!setting) { - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); goto parse_error; } if (for_real) ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } } while (!for_real++); free_page((unsigned long)buf); @@ -705,7 +705,7 @@ EXPORT_SYMBOL(ide_proc_register_driver); * Clean up the driver specific /proc files and IDE settings * for a given drive. * - * Takes ide_setting_mtx and ide_lock. + * Takes ide_setting_sem and ide_lock. * Caller must hold none of the locks. */ @@ -715,10 +715,10 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) ide_remove_proc_entries(drive->proc, driver->proc); - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); spin_lock_irqsave(&ide_lock, flags); /* - * ide_setting_mtx protects the settings list + * ide_setting_sem protects the settings list * ide_lock protects the use of settings * * so we need to hold both, ide_settings_sem because we want to @@ -726,11 +726,11 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) * a setting out that is being used. * * OTOH both ide_{read,write}_setting are only ever used under - * ide_setting_mtx. + * ide_setting_sem. 
*/ auto_remove_settings(drive); spin_unlock_irqrestore(&ide_lock, flags); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } EXPORT_SYMBOL(ide_proc_unregister_driver); diff --git a/trunk/drivers/ide/ide-timing.h b/trunk/drivers/ide/ide-timing.h index e6cb8593b5ba..c0864b1e9228 100644 --- a/trunk/drivers/ide/ide-timing.h +++ b/trunk/drivers/ide/ide-timing.h @@ -102,16 +102,66 @@ static struct ide_timing ide_timing[] = { #define EZ(v,unit) ((v)?ENOUGH(v,unit):0) #define XFER_MODE 0xf0 +#define XFER_UDMA_133 0x48 +#define XFER_UDMA_100 0x44 +#define XFER_UDMA_66 0x42 +#define XFER_UDMA 0x40 #define XFER_MWDMA 0x20 +#define XFER_SWDMA 0x10 #define XFER_EPIO 0x01 #define XFER_PIO 0x00 -static short ide_find_best_pio_mode(ide_drive_t *drive) +static short ide_find_best_mode(ide_drive_t *drive, int map) { struct hd_driveid *id = drive->id; short best = 0; - if (id->field_valid & 2) { /* EIDE PIO modes */ + if (!id) + return XFER_PIO_SLOW; + + if ((map & XFER_UDMA) && (id->field_valid & 4)) { /* Want UDMA and UDMA bitmap valid */ + + if ((map & XFER_UDMA_133) == XFER_UDMA_133) + if ((best = (id->dma_ultra & 0x0040) ? XFER_UDMA_6 : 0)) return best; + + if ((map & XFER_UDMA_100) == XFER_UDMA_100) + if ((best = (id->dma_ultra & 0x0020) ? XFER_UDMA_5 : 0)) return best; + + if ((map & XFER_UDMA_66) == XFER_UDMA_66) + if ((best = (id->dma_ultra & 0x0010) ? XFER_UDMA_4 : + (id->dma_ultra & 0x0008) ? XFER_UDMA_3 : 0)) return best; + + if ((best = (id->dma_ultra & 0x0004) ? XFER_UDMA_2 : + (id->dma_ultra & 0x0002) ? XFER_UDMA_1 : + (id->dma_ultra & 0x0001) ? XFER_UDMA_0 : 0)) return best; + } + + if ((map & XFER_MWDMA) && (id->field_valid & 2)) { /* Want MWDMA and drive has EIDE fields */ + + if ((best = (id->dma_mword & 0x0004) ? XFER_MW_DMA_2 : + (id->dma_mword & 0x0002) ? XFER_MW_DMA_1 : + (id->dma_mword & 0x0001) ? XFER_MW_DMA_0 : 0)) return best; + } + + if (map & XFER_SWDMA) { /* Want SWDMA */ + + if (id->field_valid & 2) { /* EIDE SWDMA */ + + if ((best = (id->dma_1word & 0x0004) ? XFER_SW_DMA_2 : + (id->dma_1word & 0x0002) ? XFER_SW_DMA_1 : + (id->dma_1word & 0x0001) ? XFER_SW_DMA_0 : 0)) return best; + } + + if (id->capability & 1) { /* Pre-EIDE style SWDMA */ + + if ((best = (id->tDMA == 2) ? XFER_SW_DMA_2 : + (id->tDMA == 1) ? XFER_SW_DMA_1 : + (id->tDMA == 0) ? XFER_SW_DMA_0 : 0)) return best; + } + } + + + if ((map & XFER_EPIO) && (id->field_valid & 2)) { /* EIDE PIO modes */ if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 : (drive->id->eide_pio_modes & 2) ? 
XFER_PIO_4 : @@ -212,7 +262,7 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing */ if ((speed & XFER_MODE) != XFER_PIO) { - ide_timing_compute(drive, ide_find_best_pio_mode(drive), &p, T, UT); + ide_timing_compute(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO), &p, T, UT); ide_timing_merge(&p, t, t, IDE_TIMING_ALL); } diff --git a/trunk/drivers/ide/ide.c b/trunk/drivers/ide/ide.c index c948a5c17a5d..0cd76bf66833 100644 --- a/trunk/drivers/ide/ide.c +++ b/trunk/drivers/ide/ide.c @@ -169,7 +169,7 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, static int idebus_parameter; /* holds the "idebus=" parameter */ static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */ -DEFINE_MUTEX(ide_cfg_mtx); +DECLARE_MUTEX(ide_cfg_sem); __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); #ifdef CONFIG_IDEPCI_PCIBUS_ORDER @@ -460,8 +460,6 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif) hwif->mwdma_mask = tmp_hwif->mwdma_mask; hwif->swdma_mask = tmp_hwif->swdma_mask; - hwif->cbl = tmp_hwif->cbl; - hwif->chipset = tmp_hwif->chipset; hwif->hold = tmp_hwif->hold; @@ -498,8 +496,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif) hwif->ide_dma_clear_irq = tmp_hwif->ide_dma_clear_irq; hwif->dma_host_on = tmp_hwif->dma_host_on; hwif->dma_host_off = tmp_hwif->dma_host_off; - hwif->dma_lost_irq = tmp_hwif->dma_lost_irq; - hwif->dma_timeout = tmp_hwif->dma_timeout; + hwif->ide_dma_lostirq = tmp_hwif->ide_dma_lostirq; + hwif->ide_dma_timeout = tmp_hwif->ide_dma_timeout; hwif->OUTB = tmp_hwif->OUTB; hwif->OUTBSYNC = tmp_hwif->OUTBSYNC; @@ -535,6 +533,7 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif) hwif->extra_base = tmp_hwif->extra_base; hwif->extra_ports = tmp_hwif->extra_ports; hwif->autodma = tmp_hwif->autodma; + hwif->udma_four = tmp_hwif->udma_four; hwif->hwif_data = tmp_hwif->hwif_data; } @@ -565,7 +564,7 @@ void ide_unregister(unsigned int index) { ide_drive_t *drive; ide_hwif_t *hwif, *g; - static ide_hwif_t tmp_hwif; /* protected by ide_cfg_mtx */ + static ide_hwif_t tmp_hwif; /* protected by ide_cfg_sem */ ide_hwgroup_t *hwgroup; int irq_count = 0, unit; @@ -573,7 +572,7 @@ void ide_unregister(unsigned int index) BUG_ON(in_interrupt()); BUG_ON(irqs_disabled()); - mutex_lock(&ide_cfg_mtx); + down(&ide_cfg_sem); spin_lock_irq(&ide_lock); hwif = &ide_hwifs[index]; if (!hwif->present) @@ -680,7 +679,7 @@ void ide_unregister(unsigned int index) abort: spin_unlock_irq(&ide_lock); - mutex_unlock(&ide_cfg_mtx); + up(&ide_cfg_sem); } EXPORT_SYMBOL(ide_unregister); @@ -818,9 +817,9 @@ EXPORT_SYMBOL(ide_register_hw); * Locks for IDE setting functionality */ -DEFINE_MUTEX(ide_setting_mtx); +DECLARE_MUTEX(ide_setting_sem); -EXPORT_SYMBOL_GPL(ide_setting_mtx); +EXPORT_SYMBOL_GPL(ide_setting_sem); /** * ide_spin_wait_hwgroup - wait for group @@ -1193,11 +1192,11 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device } read_val: - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); spin_lock_irqsave(&ide_lock, flags); err = *val; spin_unlock_irqrestore(&ide_lock, flags); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); return err >= 0 ? 
put_user(err, (long __user *)arg) : err; set_val: @@ -1207,9 +1206,9 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device if (!capable(CAP_SYS_ADMIN)) err = -EACCES; else { - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); err = setfunc(drive, arg); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } } return err; @@ -1549,11 +1548,7 @@ static int __init ide_setup(char *s) goto bad_option; case -7: /* ata66 */ #ifdef CONFIG_BLK_DEV_IDEPCI - /* - * Use ATA_CBL_PATA40_SHORT so drive side - * cable detection is also overriden. - */ - hwif->cbl = ATA_CBL_PATA40_SHORT; + hwif->udma_four = 1; goto obsolete_option; #else goto bad_hwif; diff --git a/trunk/drivers/ide/legacy/hd.c b/trunk/drivers/ide/legacy/hd.c index 661c12f6dda6..45ed03591cd8 100644 --- a/trunk/drivers/ide/legacy/hd.c +++ b/trunk/drivers/ide/legacy/hd.c @@ -130,7 +130,7 @@ struct hd_i_struct { #ifdef HD_TYPE static struct hd_i_struct hd_info[] = { HD_TYPE }; -static int NR_HD = ARRAY_SIZE(hd_info); +static int NR_HD = ((sizeof (hd_info))/(sizeof (struct hd_i_struct))); #else static struct hd_i_struct hd_info[MAX_HD]; static int NR_HD; diff --git a/trunk/drivers/ide/legacy/macide.c b/trunk/drivers/ide/legacy/macide.c index b557c45a5a9d..c211fc78345d 100644 --- a/trunk/drivers/ide/legacy/macide.c +++ b/trunk/drivers/ide/legacy/macide.c @@ -77,6 +77,15 @@ int macide_ack_intr(ide_hwif_t* hwif) return 0; } +#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY +static void macide_mediabay_interrupt(int irq, void *dev_id) +{ + int state = baboon->mb_status & 0x04; + + printk(KERN_INFO "macide: media bay %s detected\n", state? "removal":"insertion"); +} +#endif + /* * Probe for a Macintosh IDE interface */ @@ -119,6 +128,11 @@ void macide_init(void) ide_drive_t *drive = &ide_hwifs[index].drives[0]; drive->capacity64 = drive->cyl*drive->head*drive->sect; +#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY + request_irq(IRQ_BABOON_2, macide_mediabay_interrupt, + IRQ_FLG_FAST, "mediabay", + macide_mediabay_interrupt); +#endif } break; diff --git a/trunk/drivers/ide/mips/au1xxx-ide.c b/trunk/drivers/ide/mips/au1xxx-ide.c index 2e7013a2a7f6..ca95e990862e 100644 --- a/trunk/drivers/ide/mips/au1xxx-ide.c +++ b/trunk/drivers/ide/mips/au1xxx-ide.c @@ -381,7 +381,9 @@ static int auide_dma_setup(ide_drive_t *drive) static int auide_dma_check(ide_drive_t *drive) { - u8 speed = ide_max_dma_mode(drive); + u8 speed; + +#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA if( dbdma_init_done == 0 ){ auide_hwif.white_list = ide_in_drive_list(drive->id, @@ -392,6 +394,7 @@ static int auide_dma_check(ide_drive_t *drive) auide_ddma_init(&auide_hwif); dbdma_init_done = 1; } +#endif /* Is the drive in our DMA black list? 
*/ @@ -406,6 +409,8 @@ static int auide_dma_check(ide_drive_t *drive) else drive->using_dma = 1; + speed = ide_find_best_mode(drive, XFER_PIO | XFER_MWDMA); + if (drive->autodma && (speed & XFER_MODE) != XFER_PIO) return 0; @@ -451,9 +456,10 @@ static void auide_dma_off_quietly(ide_drive_t *drive) drive->using_dma = 0; } -static void auide_dma_lost_irq(ide_drive_t *drive) +static int auide_dma_lostirq(ide_drive_t *drive) { printk(KERN_ERR "%s: IRQ lost\n", drive->name); + return 0; } static void auide_ddma_tx_callback(int irq, void *param) @@ -483,16 +489,16 @@ static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 de #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) -static void auide_dma_timeout(ide_drive_t *drive) +static int auide_dma_timeout(ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); +// printk("%s\n", __FUNCTION__); printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); - if (hwif->ide_dma_test_irq(drive)) - return; + if (HWIF(drive)->ide_dma_test_irq(drive)) + return 0; - hwif->ide_dma_end(drive); + return HWIF(drive)->ide_dma_end(drive); } @@ -715,7 +721,7 @@ static int au_ide_probe(struct device *dev) #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA hwif->dma_off_quietly = &auide_dma_off_quietly; - hwif->dma_timeout = &auide_dma_timeout; + hwif->ide_dma_timeout = &auide_dma_timeout; hwif->ide_dma_check = &auide_dma_check; hwif->dma_exec_cmd = &auide_dma_exec_cmd; @@ -725,7 +731,7 @@ static int au_ide_probe(struct device *dev) hwif->ide_dma_test_irq = &auide_dma_test_irq; hwif->dma_host_off = &auide_dma_host_off; hwif->dma_host_on = &auide_dma_host_on; - hwif->dma_lost_irq = &auide_dma_lost_irq; + hwif->ide_dma_lostirq = &auide_dma_lostirq; hwif->ide_dma_on = &auide_dma_on; hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/aec62xx.c b/trunk/drivers/ide/pci/aec62xx.c index e5d09367627e..b173bc66ce1e 100644 --- a/trunk/drivers/ide/pci/aec62xx.c +++ b/trunk/drivers/ide/pci/aec62xx.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/aec62xx.c Version 0.24 May 24, 2007 + * linux/drivers/ide/pci/aec62xx.c Version 0.21 Apr 21, 2007 * * Copyright (C) 1999-2002 Andre Hedrick * Copyright (C) 2007 MontaVista Software, Inc. 
@@ -140,10 +140,25 @@ static int aec6260_tune_chipset (ide_drive_t *drive, u8 xferspeed) return(ide_config_drive_speed(drive, speed)); } +static int aec62xx_tune_chipset (ide_drive_t *drive, u8 speed) +{ + switch (HWIF(drive)->pci_dev->device) { + case PCI_DEVICE_ID_ARTOP_ATP865: + case PCI_DEVICE_ID_ARTOP_ATP865R: + case PCI_DEVICE_ID_ARTOP_ATP860: + case PCI_DEVICE_ID_ARTOP_ATP860R: + return ((int) aec6260_tune_chipset(drive, speed)); + case PCI_DEVICE_ID_ARTOP_ATP850UF: + return ((int) aec6210_tune_chipset(drive, speed)); + default: + return -1; + } +} + static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio) { pio = ide_get_best_pio_mode(drive, pio, 4, NULL); - (void) HWIF(drive)->speedproc(drive, pio + XFER_PIO_0); + (void) aec62xx_tune_chipset(drive, pio + XFER_PIO_0); } static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive) @@ -157,9 +172,12 @@ static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive) return -1; } -static void aec62xx_dma_lost_irq (ide_drive_t *drive) +static int aec62xx_irq_timeout (ide_drive_t *drive) { - switch (HWIF(drive)->pci_dev->device) { + ide_hwif_t *hwif = HWIF(drive); + struct pci_dev *dev = hwif->pci_dev; + + switch(dev->device) { case PCI_DEVICE_ID_ARTOP_ATP860: case PCI_DEVICE_ID_ARTOP_ATP860R: case PCI_DEVICE_ID_ARTOP_ATP865: @@ -168,6 +186,7 @@ static void aec62xx_dma_lost_irq (ide_drive_t *drive) default: break; } + return 0; } static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name) @@ -205,46 +224,64 @@ static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const ch static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif) { - struct pci_dev *dev = hwif->pci_dev; - u8 reg54 = 0, mask = hwif->channel ? 0xf0 : 0x0f; - unsigned long flags; + struct pci_dev *dev = hwif->pci_dev; + hwif->autodma = 0; hwif->tuneproc = &aec62xx_tune_drive; + hwif->speedproc = &aec62xx_tune_chipset; - if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) { - if(hwif->mate) - hwif->mate->serialized = hwif->serialized = 1; - hwif->speedproc = &aec6210_tune_chipset; - } else - hwif->speedproc = &aec6260_tune_chipset; + if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) + hwif->serialized = hwif->channel; + + if (hwif->mate) + hwif->mate->serialized = hwif->serialized; if (!hwif->dma_base) { - hwif->drives[0].autotune = hwif->drives[1].autotune = 1; + hwif->drives[0].autotune = 1; + hwif->drives[1].autotune = 1; return; } hwif->ultra_mask = hwif->cds->udma_mask; + + /* atp865 and atp865r */ + if (hwif->ultra_mask == 0x3f) { + /* check bit 0x10 of DMA status register */ + if (inb(pci_resource_start(dev, 4) + 2) & 0x10) + hwif->ultra_mask = 0x7f; /* udma0-6 */ + } + hwif->mwdma_mask = 0x07; hwif->ide_dma_check = &aec62xx_config_drive_xfer_rate; - hwif->dma_lost_irq = &aec62xx_dma_lost_irq; + hwif->ide_dma_lostirq = &aec62xx_irq_timeout; + + if (!noautodma) + hwif->autodma = 1; + hwif->drives[0].autodma = hwif->autodma; + hwif->drives[1].autodma = hwif->autodma; +} + +static void __devinit init_dma_aec62xx(ide_hwif_t *hwif, unsigned long dmabase) +{ + struct pci_dev *dev = hwif->pci_dev; if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) { + u8 reg54h = 0; + unsigned long flags; + spin_lock_irqsave(&ide_lock, flags); - pci_read_config_byte (dev, 0x54, ®54); - pci_write_config_byte(dev, 0x54, (reg54 & ~mask)); + pci_read_config_byte(dev, 0x54, ®54h); + pci_write_config_byte(dev, 0x54, reg54h & ~(hwif->channel ? 
0xF0 : 0x0F)); spin_unlock_irqrestore(&ide_lock, flags); - } else if (hwif->cbl != ATA_CBL_PATA40_SHORT) { - u8 ata66 = 0, mask = hwif->channel ? 0x02 : 0x01; - + } else { + u8 ata66 = 0; pci_read_config_byte(hwif->pci_dev, 0x49, &ata66); - - hwif->cbl = (ata66 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + if (!(hwif->udma_four)) + hwif->udma_four = (ata66&(hwif->channel?0x02:0x01))?0:1; } - if (!noautodma) - hwif->autodma = 1; - hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma; + ide_setup_dma(hwif, dmabase, 8); } static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d) @@ -254,12 +291,16 @@ static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d static int __devinit init_setup_aec6x80(struct pci_dev *dev, ide_pci_device_t *d) { - unsigned long dma_base = pci_resource_start(dev, 4); - - if (inb(dma_base + 2) & 0x10) { - d->name = (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) ? - "AEC6880R" : "AEC6880"; - d->udma_mask = 0x7f; /* udma0-6 */ + unsigned long bar4reg = pci_resource_start(dev, 4); + + if (inb(bar4reg+2) & 0x10) { + strcpy(d->name, "AEC6880"); + if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) + strcpy(d->name, "AEC6880R"); + } else { + strcpy(d->name, "AEC6280"); + if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) + strcpy(d->name, "AEC6280R"); } return ide_setup_pci_device(dev, d); @@ -271,6 +312,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { .init_setup = init_setup_aec62xx, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, @@ -281,6 +323,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { .init_setup = init_setup_aec62xx, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = NOAUTODMA, .bootable = OFF_BOARD, @@ -290,25 +333,28 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { .init_setup = init_setup_aec62xx, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, .bootable = NEVER_BOARD, .udma_mask = 0x1f, /* udma0-4 */ },{ /* 3 */ - .name = "AEC6280", + .name = "AEC6X80", .init_setup = init_setup_aec6x80, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .bootable = OFF_BOARD, .udma_mask = 0x3f, /* udma0-5 */ },{ /* 4 */ - .name = "AEC6280R", + .name = "AEC6X80R", .init_setup = init_setup_aec6x80, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, @@ -324,16 +370,13 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { * * Called when the PCI registration layer (or the IDE initialization) * finds a device matching our IDE device tables. - * - * NOTE: since we're going to modify the 'name' field for AEC-6[26]80[R] - * chips, pass a local copy of 'struct pci_device_id' down the call chain. 
*/ static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id) { - ide_pci_device_t d = aec62xx_chipsets[id->driver_data]; + ide_pci_device_t *d = &aec62xx_chipsets[id->driver_data]; - return d.init_setup(dev, &d); + return d->init_setup(dev, d); } static struct pci_device_id aec62xx_pci_tbl[] = { diff --git a/trunk/drivers/ide/pci/alim15x3.c b/trunk/drivers/ide/pci/alim15x3.c index 8a6b27b3bcc3..27525ec2e19a 100644 --- a/trunk/drivers/ide/pci/alim15x3.c +++ b/trunk/drivers/ide/pci/alim15x3.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/alim15x3.c Version 0.25 Jun 9 2007 + * linux/drivers/ide/pci/alim15x3.c Version 0.21 2007/02/03 * * Copyright (C) 1998-2000 Michel Aubry, Maintainer * Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer @@ -10,7 +10,6 @@ * Copyright (C) 2002 Alan Cox * ALi (now ULi M5228) support by Clear Zhang * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2007 Bartlomiej Zolnierkiewicz * * (U)DMA capable version of ali 1533/1543(C), 1535(D) * @@ -37,7 +36,6 @@ #include #include #include -#include #include @@ -585,35 +583,6 @@ static unsigned int __devinit init_chipset_ali15x3 (struct pci_dev *dev, const c return 0; } -/* - * Cable special cases - */ - -static struct dmi_system_id cable_dmi_table[] = { - { - .ident = "HP Pavilion N5430", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_BOARD_NAME, "OmniBook N32N-736"), - }, - }, - { } -}; - -static int ali_cable_override(struct pci_dev *pdev) -{ - /* Fujitsu P2000 */ - if (pdev->subsystem_vendor == 0x10CF && - pdev->subsystem_device == 0x10AF) - return 1; - - /* Systems by DMI */ - if (dmi_check_system(cable_dmi_table)) - return 1; - - return 0; -} - /** * ata66_ali15x3 - check for UDMA 66 support * @hwif: IDE interface @@ -625,31 +594,37 @@ static int ali_cable_override(struct pci_dev *pdev) * FIXME: frobs bits that are not defined on newer ALi devicea */ -static u8 __devinit ata66_ali15x3(ide_hwif_t *hwif) +static unsigned int __devinit ata66_ali15x3 (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; + unsigned int ata66 = 0; + u8 cable_80_pin[2] = { 0, 0 }; + unsigned long flags; - u8 cbl = ATA_CBL_PATA40, tmpbyte; + u8 tmpbyte; local_irq_save(flags); if (m5229_revision >= 0xC2) { /* - * m5229 80-pin cable detection (from Host View) - * - * 0x4a bit0 is 0 => primary channel has 80-pin - * 0x4a bit1 is 0 => secondary channel has 80-pin - * - * Certain laptops use short but suitable cables - * and don't implement the detect logic. 
+ * Ultra66 cable detection (from Host View) + * m5229, 0x4a, bit0: primary, bit1: secondary 80 pin */ - if (ali_cable_override(dev)) - cbl = ATA_CBL_PATA40_SHORT; - else { - pci_read_config_byte(dev, 0x4a, &tmpbyte); - if ((tmpbyte & (1 << hwif->channel)) == 0) - cbl = ATA_CBL_PATA80; - } + pci_read_config_byte(dev, 0x4a, &tmpbyte); + /* + * 0x4a, bit0 is 0 => primary channel + * has 80-pin (from host view) + */ + if (!(tmpbyte & 0x01)) cable_80_pin[0] = 1; + /* + * 0x4a, bit1 is 0 => secondary channel + * has 80-pin (from host view) + */ + if (!(tmpbyte & 0x02)) cable_80_pin[1] = 1; + /* + * Allow ata66 if cable of current channel has 80 pins + */ + ata66 = (hwif->channel)?cable_80_pin[1]:cable_80_pin[0]; } else { /* * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010 @@ -682,7 +657,7 @@ static u8 __devinit ata66_ali15x3(ide_hwif_t *hwif) local_irq_restore(flags); - return cbl; + return(ata66); } /** @@ -733,9 +708,8 @@ static void __devinit init_hwif_common_ali15x3 (ide_hwif_t *hwif) hwif->dma_setup = &ali15x3_dma_setup; if (!noautodma) hwif->autodma = 1; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_ali15x3(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_ali15x3(hwif); } hwif->drives[0].autodma = hwif->autodma; hwif->drives[1].autodma = hwif->autodma; diff --git a/trunk/drivers/ide/pci/amd74xx.c b/trunk/drivers/ide/pci/amd74xx.c index 84ed30cdb324..a2be65fcf89c 100644 --- a/trunk/drivers/ide/pci/amd74xx.c +++ b/trunk/drivers/ide/pci/amd74xx.c @@ -1,11 +1,10 @@ /* - * Version 2.20 + * Version 2.16 * * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04 * IDE driver for Linux. * * Copyright (c) 2000-2002 Vojtech Pavlik - * Copyright (c) 2007 Bartlomiej Zolnierkiewicz * * Based on the work of: * Andre Hedrick @@ -38,6 +37,11 @@ #define AMD_ADDRESS_SETUP (0x0c + amd_config->base) #define AMD_UDMA_TIMING (0x10 + amd_config->base) +#define AMD_UDMA 0x07 +#define AMD_UDMA_33 0x01 +#define AMD_UDMA_66 0x02 +#define AMD_UDMA_100 0x03 +#define AMD_UDMA_133 0x04 #define AMD_CHECK_SWDMA 0x08 #define AMD_BAD_SWDMA 0x10 #define AMD_BAD_FIFO 0x20 @@ -49,33 +53,32 @@ static struct amd_ide_chip { unsigned short id; - u8 base; - u8 udma_mask; - u8 flags; + unsigned long base; + unsigned char flags; } amd_ide_chips[] = { - { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, ATA_UDMA2, AMD_BAD_SWDMA }, - { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, ATA_UDMA4, AMD_CHECK_SWDMA }, - { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, ATA_UDMA5, AMD_BAD_FIFO }, - { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, ATA_UDMA5, }, - { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, ATA_UDMA6, AMD_CHECK_SERENADE }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, ATA_UDMA5, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, ATA_UDMA6, }, - { 
PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, ATA_UDMA5, }, + { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, AMD_UDMA_33 | AMD_BAD_SWDMA }, + { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, AMD_UDMA_66 | AMD_CHECK_SWDMA }, + { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, AMD_UDMA_100 | AMD_BAD_FIFO }, + { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, AMD_UDMA_100 }, + { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, AMD_UDMA_133 | AMD_CHECK_SERENADE }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, AMD_UDMA_100 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, AMD_UDMA_100 }, { 0 } }; @@ -84,7 +87,7 @@ static ide_pci_device_t *amd_chipset; static unsigned int amd_80w; static unsigned int amd_clock; -static char *amd_dma[] = { "16", "25", "33", "44", "66", "100", "133" }; +static char *amd_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 }; /* @@ -125,7 +128,7 @@ static int amd74xx_get_info(char *buffer, char **addr, off_t offset, int count) pci_read_config_byte(dev, PCI_REVISION_ID, &t); amd_print("Revision: IDE %#x", t); - amd_print("Highest DMA rate: UDMA%s", amd_dma[fls(amd_config->udma_mask) - 1]); + amd_print("Highest DMA rate: %s", amd_dma[amd_config->flags & AMD_UDMA]); amd_print("BM-DMA base: %#lx", amd_base); amd_print("PCI clock: %d.%dMHz", amd_clock / 1000, amd_clock / 100 % 10); @@ -218,12 +221,12 @@ static void amd_set_speed(struct pci_dev *dev, unsigned char dn, struct ide_timi pci_write_config_byte(dev, AMD_DRIVE_TIMING + (3 - dn), ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); - switch (amd_config->udma_mask) { - case ATA_UDMA2: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; - case ATA_UDMA4: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break; - case ATA_UDMA5: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break; - case ATA_UDMA6: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break; - default: return; + switch (amd_config->flags & AMD_UDMA) { + case AMD_UDMA_33: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; + case AMD_UDMA_66: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break; + case AMD_UDMA_100: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break; + case AMD_UDMA_133: t = timing->udma ? 
(0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break; + default: return; } pci_write_config_byte(dev, AMD_UDMA_TIMING + (3 - dn), t); @@ -245,7 +248,7 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed) ide_config_drive_speed(drive, speed); T = 1000000000 / amd_clock; - UT = (amd_config->udma_mask == ATA_UDMA2) ? T : (T / 2); + UT = T / min_t(int, max_t(int, amd_config->flags & AMD_UDMA, 1), 2); ide_timing_compute(drive, speed, &t, T, UT); @@ -274,19 +277,29 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed) static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio) { if (pio == 255) { - amd_set_drive(drive, ide_find_best_pio_mode(drive)); + amd_set_drive(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO)); return; } amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5)); } +/* + * amd74xx_dmaproc() is a callback from upper layers that can do + * a lot, but we use it for DMA/PIO tuning only, delegating everything + * else to the default ide_dmaproc(). + */ + static int amd74xx_ide_dma_check(ide_drive_t *drive) { - u8 speed = ide_max_dma_mode(drive); + int w80 = HWIF(drive)->udma_four; - if (speed == 0) - speed = ide_find_best_pio_mode(drive); + u8 speed = ide_find_best_mode(drive, + XFER_PIO | XFER_EPIO | XFER_MWDMA | XFER_UDMA | + ((amd_config->flags & AMD_BAD_SWDMA) ? 0 : XFER_SWDMA) | + (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_66 ? XFER_UDMA_66 : 0) | + (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_100 ? XFER_UDMA_100 : 0) | + (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_133 ? XFER_UDMA_133 : 0)); amd_set_drive(drive, speed); @@ -321,10 +334,10 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch * Check 80-wire cable presence. */ - switch (amd_config->udma_mask) { + switch (amd_config->flags & AMD_UDMA) { - case ATA_UDMA6: - case ATA_UDMA5: + case AMD_UDMA_133: + case AMD_UDMA_100: pci_read_config_byte(dev, AMD_CABLE_DETECT, &t); pci_read_config_dword(dev, AMD_UDMA_TIMING, &u); amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0); @@ -336,7 +349,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch } break; - case ATA_UDMA4: + case AMD_UDMA_66: /* no host side cable detection */ amd_80w = 0x03; break; @@ -357,7 +370,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch if ((amd_config->flags & AMD_CHECK_SERENADE) && dev->subsystem_vendor == PCI_VENDOR_ID_AMD && dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE) - amd_config->udma_mask = ATA_UDMA5; + amd_config->flags = AMD_UDMA_100; /* * Determine the system bus clock. 
@@ -382,9 +395,8 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch */ pci_read_config_byte(dev, PCI_REVISION_ID, &t); - printk(KERN_INFO "%s: %s (rev %02x) UDMA%s controller\n", - amd_chipset->name, pci_name(dev), t, - amd_dma[fls(amd_config->udma_mask) - 1]); + printk(KERN_INFO "%s: %s (rev %02x) %s controller\n", + amd_chipset->name, pci_name(dev), t, amd_dma[amd_config->flags & AMD_UDMA]); /* * Register /proc/ide/amd74xx entry @@ -425,19 +437,12 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif) return; hwif->atapi_dma = 1; + hwif->ultra_mask = 0x7f; + hwif->mwdma_mask = 0x07; + hwif->swdma_mask = 0x07; - hwif->ultra_mask = amd_config->udma_mask; - hwif->mwdma_mask = 0x07; - if ((amd_config->flags & AMD_BAD_SWDMA) == 0) - hwif->swdma_mask = 0x07; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) { - if ((amd_80w >> hwif->channel) & 1) - hwif->cbl = ATA_CBL_PATA80; - else - hwif->cbl = ATA_CBL_PATA40; - } - + if (!hwif->udma_four) + hwif->udma_four = (amd_80w >> hwif->channel) & 1; hwif->ide_dma_check = &amd74xx_ide_dma_check; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/atiixp.c b/trunk/drivers/ide/pci/atiixp.c index 2761510309b3..8ab33faf6f76 100644 --- a/trunk/drivers/ide/pci/atiixp.c +++ b/trunk/drivers/ide/pci/atiixp.c @@ -264,11 +264,10 @@ static void __devinit init_hwif_atiixp(ide_hwif_t *hwif) hwif->swdma_mask = 0x04; pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode); - if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40) - hwif->cbl = ATA_CBL_PATA80; + hwif->udma_four = 1; else - hwif->cbl = ATA_CBL_PATA40; + hwif->udma_four = 0; hwif->dma_host_on = &atiixp_dma_host_on; hwif->dma_host_off = &atiixp_dma_host_off; diff --git a/trunk/drivers/ide/pci/cmd64x.c b/trunk/drivers/ide/pci/cmd64x.c index 8631b6c8aa15..7c57dc696f52 100644 --- a/trunk/drivers/ide/pci/cmd64x.c +++ b/trunk/drivers/ide/pci/cmd64x.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/cmd64x.c Version 1.50 May 10, 2007 + * linux/drivers/ide/pci/cmd64x.c Version 1.47 Mar 19, 2007 * * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines. 
* Due to massive hardware bugs, UltraDMA is only supported @@ -52,6 +52,9 @@ #define ARTTIM23_DIS_RA2 0x04 #define ARTTIM23_DIS_RA3 0x08 #define ARTTIM23_INTR_CH1 0x10 +#define ARTTIM2 0x57 +#define ARTTIM3 0x57 +#define DRWTIM23 0x58 #define DRWTIM2 0x58 #define BRST 0x59 #define DRWTIM3 0x5b @@ -466,43 +469,71 @@ static int cmd646_1_ide_dma_end (ide_drive_t *drive) static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const char *name) { + u32 class_rev = 0; u8 mrdmode = 0; - if (dev->device == PCI_DEVICE_ID_CMD_646) { - u8 rev = 0; + pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); + class_rev &= 0xff; - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - - switch (rev) { - case 0x07: - case 0x05: - printk("%s: UltraDMA capable", name); + switch(dev->device) { + case PCI_DEVICE_ID_CMD_643: break; - case 0x03: - default: - printk("%s: MultiWord DMA force limited", name); + case PCI_DEVICE_ID_CMD_646: + printk(KERN_INFO "%s: chipset revision 0x%02X, ", name, class_rev); + switch(class_rev) { + case 0x07: + case 0x05: + printk("UltraDMA Capable"); + break; + case 0x03: + printk("MultiWord DMA Force Limited"); + break; + case 0x01: + default: + printk("MultiWord DMA Limited, IRQ workaround enabled"); + break; + } + printk("\n"); + break; + case PCI_DEVICE_ID_CMD_648: + case PCI_DEVICE_ID_CMD_649: break; - case 0x01: - printk("%s: MultiWord DMA limited, " - "IRQ workaround enabled\n", name); + default: break; - } } /* Set a good latency timer and cache line size value. */ (void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64); /* FIXME: pci_set_master() to ensure a good latency timer value */ - /* - * Enable interrupts, select MEMORY READ LINE for reads. - * - * NOTE: although not mentioned in the PCI0646U specs, - * bits 0-1 are write only and won't be read back as - * set or not -- PCI0646U2 specs clarify this point. + /* Setup interrupts. */ + (void) pci_read_config_byte(dev, MRDMODE, &mrdmode); + mrdmode &= ~(0x30); + (void) pci_write_config_byte(dev, MRDMODE, mrdmode); + + /* Use MEMORY READ LINE for reads. + * NOTE: Although not mentioned in the PCI0646U specs, + * these bits are write only and won't be read + * back as set or not. The PCI0646U2 specs clarify + * this point. */ - (void) pci_read_config_byte (dev, MRDMODE, &mrdmode); - mrdmode &= ~0x30; - (void) pci_write_config_byte(dev, MRDMODE, (mrdmode | 0x02)); + (void) pci_write_config_byte(dev, MRDMODE, mrdmode | 0x02); + + /* Set reasonable active/recovery/address-setup values. */ + (void) pci_write_config_byte(dev, ARTTIM0, 0x40); + (void) pci_write_config_byte(dev, DRWTIM0, 0x3f); + (void) pci_write_config_byte(dev, ARTTIM1, 0x40); + (void) pci_write_config_byte(dev, DRWTIM1, 0x3f); +#ifdef __i386__ + (void) pci_write_config_byte(dev, ARTTIM23, 0x1c); +#else + (void) pci_write_config_byte(dev, ARTTIM23, 0x5c); +#endif + (void) pci_write_config_byte(dev, DRWTIM23, 0x3f); + (void) pci_write_config_byte(dev, DRWTIM3, 0x3f); +#ifdef CONFIG_PPC + (void) pci_write_config_byte(dev, UDIDETCR0, 0xf0); +#endif /* CONFIG_PPC */ #if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS) @@ -517,27 +548,29 @@ static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const cha return 0; } -static u8 __devinit ata66_cmd64x(ide_hwif_t *hwif) +static unsigned int __devinit ata66_cmd64x(ide_hwif_t *hwif) { - struct pci_dev *dev = hwif->pci_dev; - u8 bmidecsr = 0, mask = hwif->channel ? 0x02 : 0x01; + u8 ata66 = 0, mask = (hwif->channel) ? 
0x02 : 0x01; - switch (dev->device) { - case PCI_DEVICE_ID_CMD_648: - case PCI_DEVICE_ID_CMD_649: - pci_read_config_byte(dev, BMIDECSR, &bmidecsr); - return (bmidecsr & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; - default: - return ATA_CBL_PATA40; + switch(hwif->pci_dev->device) { + case PCI_DEVICE_ID_CMD_643: + case PCI_DEVICE_ID_CMD_646: + return ata66; + default: + break; } + pci_read_config_byte(hwif->pci_dev, BMIDECSR, &ata66); + return (ata66 & mask) ? 1 : 0; } static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; - u8 rev = 0; + unsigned int class_rev; - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); + hwif->autodma = 0; + pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); + class_rev &= 0xff; hwif->tuneproc = &cmd64x_tune_drive; hwif->speedproc = &cmd64x_tune_chipset; @@ -547,8 +580,8 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) if (!hwif->dma_base) return; - hwif->atapi_dma = 1; - hwif->mwdma_mask = 0x07; + hwif->atapi_dma = 1; + hwif->ultra_mask = hwif->cds->udma_mask; /* @@ -563,15 +596,16 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) * * So we only do UltraDMA on revision 0x05 and 0x07 chipsets. */ - if (dev->device == PCI_DEVICE_ID_CMD_646 && rev < 5) + if (dev->device == PCI_DEVICE_ID_CMD_646 && class_rev < 5) hwif->ultra_mask = 0x00; - hwif->ide_dma_check = &cmd64x_config_drive_for_dma; + hwif->mwdma_mask = 0x07; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_cmd64x(hwif); + hwif->ide_dma_check = &cmd64x_config_drive_for_dma; + if (!(hwif->udma_four)) + hwif->udma_four = ata66_cmd64x(hwif); - switch (dev->device) { + switch(dev->device) { case PCI_DEVICE_ID_CMD_648: case PCI_DEVICE_ID_CMD_649: alt_irq_bits: @@ -580,10 +614,10 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) break; case PCI_DEVICE_ID_CMD_646: hwif->chipset = ide_cmd646; - if (rev == 0x01) { + if (class_rev == 0x01) { hwif->ide_dma_end = &cmd646_1_ide_dma_end; break; - } else if (rev >= 0x03) + } else if (class_rev >= 0x03) goto alt_irq_bits; /* fall thru */ default: @@ -592,9 +626,11 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) break; } + if (!noautodma) hwif->autodma = 1; - hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma; + hwif->drives[0].autodma = hwif->autodma; + hwif->drives[1].autodma = hwif->autodma; } static int __devinit init_setup_cmd64x(struct pci_dev *dev, ide_pci_device_t *d) diff --git a/trunk/drivers/ide/pci/cs5535.c b/trunk/drivers/ide/pci/cs5535.c index 10f61f38243c..41925c47ef05 100644 --- a/trunk/drivers/ide/pci/cs5535.c +++ b/trunk/drivers/ide/pci/cs5535.c @@ -187,8 +187,7 @@ static u8 __devinit cs5535_cable_detect(struct pci_dev *dev) /* if a 80 wire cable was detected */ pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit); - - return (bit & 1) ? 
ATA_CBL_PATA80 : ATA_CBL_PATA40; + return (bit & 1); } /**** @@ -213,7 +212,8 @@ static void __devinit init_hwif_cs5535(ide_hwif_t *hwif) hwif->ultra_mask = 0x1F; hwif->mwdma_mask = 0x07; - hwif->cbl = cs5535_cable_detect(hwif->pci_dev); + + hwif->udma_four = cs5535_cable_detect(hwif->pci_dev); if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/hpt366.c b/trunk/drivers/ide/pci/hpt366.c index 4b6bae8eee82..c33d0b0f11c9 100644 --- a/trunk/drivers/ide/pci/hpt366.c +++ b/trunk/drivers/ide/pci/hpt366.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/hpt366.c Version 1.10 Jun 29, 2007 + * linux/drivers/ide/pci/hpt366.c Version 1.06 Jun 27, 2007 * * Copyright (C) 1999-2003 Andre Hedrick * Portions Copyright (C) 2001 Sun Microsystems, Inc. @@ -77,7 +77,7 @@ * since they may tamper with its fields * - prefix the driver startup messages with the real chip name * - claim the extra 240 bytes of I/O space for all chips - * - optimize the UltraDMA filtering and the drive list lookup code + * - optimize the rate masking/filtering and the drive list lookup code * - use pci_get_slot() to get to the function 1 of HPT36x/374 * - cache offset of the channel's misc. control registers (MCRs) being used * throughout the driver @@ -99,9 +99,9 @@ * stop duplicating it for each channel by storing the pointer in the pci_dev * structure: first, at the init_setup stage, point it to a static "template" * with only the chip type and its specific base DPLL frequency, the highest - * UltraDMA mode, and the chip settings table pointer filled, then, at the - * init_chipset stage, allocate per-chip instance and fill it with the rest - * of the necessary information + * supported DMA mode, and the chip settings table pointer filled, then, at + * the init_chipset stage, allocate per-chip instance and fill it with the + * rest of the necessary information * - get rid of the constant thresholds in the HPT37x PCI clock detection code, * switch to calculating PCI clock frequency based on the chip's base DPLL * frequency @@ -112,7 +112,6 @@ * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips; * unify HPT36x/37x timing setup code and the speedproc handlers by joining * the register setting lists into the table indexed by the clock selected - * - set the correct hwif->ultra_mask for each individual chip * Sergei Shtylyov, or */ @@ -392,7 +391,7 @@ enum ata_clock { struct hpt_info { u8 chip_type; /* Chip type */ - u8 max_ultra; /* Max. UltraDMA mode allowed */ + u8 max_mode; /* Speeds allowed */ u8 dpll_clk; /* DPLL clock in MHz */ u8 pci_clk; /* PCI clock in MHz */ u32 **settings; /* Chipset settings table */ @@ -431,77 +430,77 @@ static u32 *hpt37x_settings[NUM_ATA_CLOCKS] = { static struct hpt_info hpt36x __devinitdata = { .chip_type = HPT36x, - .max_ultra = HPT366_ALLOW_ATA66_3 ? (HPT366_ALLOW_ATA66_4 ? 4 : 3) : 2, + .max_mode = (HPT366_ALLOW_ATA66_4 || HPT366_ALLOW_ATA66_3) ? 2 : 1, .dpll_clk = 0, /* no DPLL */ .settings = hpt36x_settings }; static struct hpt_info hpt370 __devinitdata = { .chip_type = HPT370, - .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4, + .max_mode = HPT370_ALLOW_ATA100_5 ? 3 : 2, .dpll_clk = 48, .settings = hpt37x_settings }; static struct hpt_info hpt370a __devinitdata = { .chip_type = HPT370A, - .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4, + .max_mode = HPT370_ALLOW_ATA100_5 ? 
3 : 2, .dpll_clk = 48, .settings = hpt37x_settings }; static struct hpt_info hpt374 __devinitdata = { .chip_type = HPT374, - .max_ultra = 5, + .max_mode = 3, .dpll_clk = 48, .settings = hpt37x_settings }; static struct hpt_info hpt372 __devinitdata = { .chip_type = HPT372, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 55, .settings = hpt37x_settings }; static struct hpt_info hpt372a __devinitdata = { .chip_type = HPT372A, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 66, .settings = hpt37x_settings }; static struct hpt_info hpt302 __devinitdata = { .chip_type = HPT302, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 66, .settings = hpt37x_settings }; static struct hpt_info hpt371 __devinitdata = { .chip_type = HPT371, - .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 66, .settings = hpt37x_settings }; static struct hpt_info hpt372n __devinitdata = { .chip_type = HPT372N, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 77, .settings = hpt37x_settings }; static struct hpt_info hpt302n __devinitdata = { .chip_type = HPT302N, - .max_ultra = HPT302_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 77, .settings = hpt37x_settings }; static struct hpt_info hpt371n __devinitdata = { .chip_type = HPT371N, - .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 77, .settings = hpt37x_settings }; @@ -524,38 +523,53 @@ static int check_in_drive_list(ide_drive_t *drive, const char **list) static u8 hpt3xx_udma_filter(ide_drive_t *drive) { struct hpt_info *info = pci_get_drvdata(HWIF(drive)->pci_dev); + u8 chip_type = info->chip_type; + u8 mode = info->max_mode; u8 mask; - switch (info->chip_type) { - case HPT370A: - if (!HPT370_ALLOW_ATA100_5 || - check_in_drive_list(drive, bad_ata100_5)) - return 0x1f; - else - return 0x3f; - case HPT370: - if (!HPT370_ALLOW_ATA100_5 || - check_in_drive_list(drive, bad_ata100_5)) - mask = 0x1f; - else + switch (mode) { + case 0x04: + mask = 0x7f; + break; + case 0x03: mask = 0x3f; - break; - case HPT36x: - if (!HPT366_ALLOW_ATA66_4 || - check_in_drive_list(drive, bad_ata66_4)) - mask = 0x0f; - else + if (chip_type >= HPT374) + break; + if (!check_in_drive_list(drive, bad_ata100_5)) + goto check_bad_ata33; + /* fall thru */ + case 0x02: mask = 0x1f; - if (!HPT366_ALLOW_ATA66_3 || - check_in_drive_list(drive, bad_ata66_3)) + /* + * CHECK ME, Does this need to be changed to HPT374 ?? + */ + if (chip_type >= HPT370) + goto check_bad_ata33; + if (HPT366_ALLOW_ATA66_4 && + !check_in_drive_list(drive, bad_ata66_4)) + goto check_bad_ata33; + + mask = 0x0f; + if (HPT366_ALLOW_ATA66_3 && + !check_in_drive_list(drive, bad_ata66_3)) + goto check_bad_ata33; + /* fall thru */ + case 0x01: mask = 0x07; - break; - default: - return 0x7f; - } - return check_in_drive_list(drive, bad_ata33) ? 0x00 : mask; + check_bad_ata33: + if (chip_type >= HPT370A) + break; + if (!check_in_drive_list(drive, bad_ata33)) + break; + /* fall thru */ + case 0x00: + default: + mask = 0x00; + break; + } + return mask; } static u32 get_speed_setting(u8 speed, struct hpt_info *info) @@ -723,7 +737,7 @@ static int hpt366_config_drive_xfer_rate(ide_drive_t *drive) * This is specific to the HPT366 UDMA chipset * by HighPoint|Triones Technologies, Inc. 
*/ -static void hpt366_dma_lost_irq(ide_drive_t *drive) +static int hpt366_ide_dma_lostirq(ide_drive_t *drive) { struct pci_dev *dev = HWIF(drive)->pci_dev; u8 mcr1 = 0, mcr3 = 0, scr1 = 0; @@ -735,7 +749,7 @@ static void hpt366_dma_lost_irq(ide_drive_t *drive) drive->name, __FUNCTION__, mcr1, mcr3, scr1); if (scr1 & 0x10) pci_write_config_byte(dev, 0x5a, scr1 & ~0x10); - ide_dma_lost_irq(drive); + return __ide_dma_lostirq(drive); } static void hpt370_clear_engine(ide_drive_t *drive) @@ -785,10 +799,10 @@ static int hpt370_ide_dma_end(ide_drive_t *drive) return __ide_dma_end(drive); } -static void hpt370_dma_timeout(ide_drive_t *drive) +static int hpt370_ide_dma_timeout(ide_drive_t *drive) { hpt370_irq_timeout(drive); - ide_dma_timeout(drive); + return __ide_dma_timeout(drive); } /* returns 1 if DMA IRQ issued, 0 otherwise */ @@ -1136,7 +1150,7 @@ static unsigned int __devinit init_chipset_hpt366(struct pci_dev *dev, const cha * Select 66 MHz DPLL clock only if UltraATA/133 mode is * supported/enabled, use 50 MHz DPLL clock otherwise... */ - if (info->max_ultra == 6) { + if (info->max_mode == 0x04) { dpll_clk = 66; clock = ATA_CLOCK_66MHZ; } else if (dpll_clk) { /* HPT36x chips don't have DPLL */ @@ -1229,7 +1243,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) struct pci_dev *dev = hwif->pci_dev; struct hpt_info *info = pci_get_drvdata(dev); int serialize = HPT_SERIALIZE_IO; - u8 scr1 = 0, ata66 = hwif->channel ? 0x01 : 0x02; + u8 scr1 = 0, ata66 = (hwif->channel) ? 0x01 : 0x02; u8 chip_type = info->chip_type; u8 new_mcr, old_mcr = 0; @@ -1242,9 +1256,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) hwif->intrproc = &hpt3xx_intrproc; hwif->maskproc = &hpt3xx_maskproc; hwif->busproc = &hpt3xx_busproc; - - if (chip_type <= HPT370A) - hwif->udma_filter = &hpt3xx_udma_filter; + hwif->udma_filter = &hpt3xx_udma_filter; /* * HPT3xxN chips have some complications: @@ -1293,7 +1305,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) return; } - hwif->ultra_mask = hwif->cds->udma_mask; + hwif->ultra_mask = 0x7f; hwif->mwdma_mask = 0x07; /* @@ -1330,8 +1342,8 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) } else pci_read_config_byte (dev, 0x5a, &scr1); - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = (scr1 & ata66) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + if (!hwif->udma_four) + hwif->udma_four = (scr1 & ata66) ? 0 : 1; hwif->ide_dma_check = &hpt366_config_drive_xfer_rate; @@ -1341,9 +1353,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) } else if (chip_type >= HPT370) { hwif->dma_start = &hpt370_ide_dma_start; hwif->ide_dma_end = &hpt370_ide_dma_end; - hwif->dma_timeout = &hpt370_dma_timeout; + hwif->ide_dma_timeout = &hpt370_ide_dma_timeout; } else - hwif->dma_lost_irq = &hpt366_dma_lost_irq; + hwif->ide_dma_lostirq = &hpt366_ide_dma_lostirq; if (!noautodma) hwif->autodma = 1; @@ -1491,35 +1503,9 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d) pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - switch (rev) { - case 0: - case 1: - case 2: - /* - * HPT36x chips have one channel per function and have - * both channel enable bits located differently and visible - * to both functions -- really stupid design decision... :-( - * Bit 4 is for the primary channel, bit 5 for the secondary. - */ - d->channels = 1; - d->enablebits[0].mask = d->enablebits[0].val = 0x10; - - d->udma_mask = HPT366_ALLOW_ATA66_3 ? - (HPT366_ALLOW_ATA66_4 ? 
0x1f : 0x0f) : 0x07; - break; - case 3: - case 4: - d->udma_mask = HPT370_ALLOW_ATA100_5 ? 0x3f : 0x1f; - break; - default: + if (rev > 6) rev = 6; - /* fall thru */ - case 5: - case 6: - d->udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f; - break; - } - + d->name = chipset_names[rev]; pci_set_drvdata(dev, info[rev]); @@ -1527,6 +1513,15 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d) if (rev > 2) goto init_single; + /* + * HPT36x chips have one channel per function and have + * both channel enable bits located differently and visible + * to both functions -- really stupid design decision... :-( + * Bit 4 is for the primary channel, bit 5 for the secondary. + */ + d->channels = 1; + d->enablebits[0].mask = d->enablebits[0].val = 0x10; + if ((dev2 = pci_get_slot(dev->bus, dev->devfn + 1)) != NULL) { u8 mcr1 = 0, pin1 = 0, pin2 = 0; int ret; @@ -1578,7 +1573,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 2 */ @@ -1590,7 +1584,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT302_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 3 */ @@ -1602,7 +1595,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT371_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 4 */ @@ -1614,7 +1606,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, /* 4 */ .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 5 */ @@ -1626,7 +1617,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, /* 4 */ .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 } diff --git a/trunk/drivers/ide/pci/it8213.c b/trunk/drivers/ide/pci/it8213.c index ff48c23e571e..c04a02687b95 100644 --- a/trunk/drivers/ide/pci/it8213.c +++ b/trunk/drivers/ide/pci/it8213.c @@ -231,7 +231,7 @@ static int it8213_config_drive_for_dma (ide_drive_t *drive) static void __devinit init_hwif_it8213(ide_hwif_t *hwif) { - u8 reg42h = 0; + u8 reg42h = 0, ata66 = 0; hwif->speedproc = &it8213_tune_chipset; hwif->tuneproc = &it8213_tuneproc; @@ -250,11 +250,11 @@ static void __devinit init_hwif_it8213(ide_hwif_t *hwif) hwif->swdma_mask = 0x04; pci_read_config_byte(hwif->pci_dev, 0x42, ®42h); + ata66 = (reg42h & 0x02) ? 0 : 1; hwif->ide_dma_check = &it8213_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = (reg42h & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + if (!(hwif->udma_four)) + hwif->udma_four = ata66; /* * The BIOS often doesn't set up DMA on this controller diff --git a/trunk/drivers/ide/pci/it821x.c b/trunk/drivers/ide/pci/it821x.c index 8197b653ba1e..3aeb7f1b7916 100644 --- a/trunk/drivers/ide/pci/it821x.c +++ b/trunk/drivers/ide/pci/it821x.c @@ -491,10 +491,10 @@ static int it821x_config_drive_for_dma (ide_drive_t *drive) * the needed logic onboard. 
*/ -static u8 __devinit ata66_it821x(ide_hwif_t *hwif) +static unsigned int __devinit ata66_it821x(ide_hwif_t *hwif) { /* The reference driver also only does disk side */ - return ATA_CBL_PATA80; + return 1; } /** @@ -662,9 +662,8 @@ static void __devinit init_hwif_it821x(ide_hwif_t *hwif) hwif->mwdma_mask = 0x07; hwif->ide_dma_check = &it821x_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_it821x(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_it821x(hwif); /* * The BIOS often doesn't set up DMA on this controller diff --git a/trunk/drivers/ide/pci/jmicron.c b/trunk/drivers/ide/pci/jmicron.c index a6008f63e71e..76ed25147229 100644 --- a/trunk/drivers/ide/pci/jmicron.c +++ b/trunk/drivers/ide/pci/jmicron.c @@ -25,10 +25,10 @@ typedef enum { * ata66_jmicron - Cable check * @hwif: IDE port * - * Returns the cable type. + * Return 1 if the cable is 80pin */ -static u8 __devinit ata66_jmicron(ide_hwif_t *hwif) +static int __devinit ata66_jmicron(ide_hwif_t *hwif) { struct pci_dev *pdev = hwif->pci_dev; @@ -70,17 +70,16 @@ static u8 __devinit ata66_jmicron(ide_hwif_t *hwif) { case PORT_PATA0: if (control & (1 << 3)) /* 40/80 pin primary */ - return ATA_CBL_PATA40; - return ATA_CBL_PATA80; + return 0; + return 1; case PORT_PATA1: if (control5 & (1 << 19)) /* 40/80 pin secondary */ - return ATA_CBL_PATA40; - return ATA_CBL_PATA80; + return 0; + return 1; case PORT_SATA: break; } - /* Avoid bogus "control reaches end of non-void function" */ - return ATA_CBL_PATA80; + return 1; /* Avoid bogus "control reaches end of non-void function" */ } static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted) @@ -160,9 +159,8 @@ static void __devinit init_hwif_jmicron(ide_hwif_t *hwif) hwif->mwdma_mask = 0x07; hwif->ide_dma_check = &jmicron_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_jmicron(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_jmicron(hwif); hwif->autodma = 1; hwif->drives[0].autodma = hwif->autodma; diff --git a/trunk/drivers/ide/pci/pdc202xx_new.c b/trunk/drivers/ide/pci/pdc202xx_new.c index ee5020df005d..0765dce6948e 100644 --- a/trunk/drivers/ide/pci/pdc202xx_new.c +++ b/trunk/drivers/ide/pci/pdc202xx_new.c @@ -225,10 +225,7 @@ static void pdcnew_tune_drive(ide_drive_t *drive, u8 pio) static u8 pdcnew_cable_detect(ide_hwif_t *hwif) { - if (get_indexed_reg(hwif, 0x0b) & 0x04) - return ATA_CBL_PATA40; - else - return ATA_CBL_PATA80; + return get_indexed_reg(hwif, 0x0b) & 0x04; } static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive) @@ -512,8 +509,8 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif) hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = pdcnew_cable_detect(hwif); + if (!hwif->udma_four) + hwif->udma_four = pdcnew_cable_detect(hwif) ? 0 : 1; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/pdc202xx_old.c b/trunk/drivers/ide/pci/pdc202xx_old.c index 41ac4a94959f..23844687deea 100644 --- a/trunk/drivers/ide/pci/pdc202xx_old.c +++ b/trunk/drivers/ide/pci/pdc202xx_old.c @@ -152,10 +152,8 @@ static void pdc202xx_tune_drive(ide_drive_t *drive, u8 pio) static u8 pdc202xx_old_cable_detect (ide_hwif_t *hwif) { u16 CIS = 0, mask = (hwif->channel) ? (1<<11) : (1<<10); - pci_read_config_word(hwif->pci_dev, 0x50, &CIS); - - return (CIS & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + return (CIS & mask) ? 
1 : 0; } /* @@ -269,24 +267,18 @@ static int pdc202xx_old_ide_dma_test_irq(ide_drive_t *drive) return (dma_stat & 4) == 4; /* return 1 if INTR asserted */ } -static void pdc202xx_dma_lost_irq(ide_drive_t *drive) +static int pdc202xx_ide_dma_lostirq(ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); - - if (hwif->resetproc != NULL) - hwif->resetproc(drive); - - ide_dma_lost_irq(drive); + if (HWIF(drive)->resetproc != NULL) + HWIF(drive)->resetproc(drive); + return __ide_dma_lostirq(drive); } -static void pdc202xx_dma_timeout(ide_drive_t *drive) +static int pdc202xx_ide_dma_timeout(ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); - - if (hwif->resetproc != NULL) - hwif->resetproc(drive); - - ide_dma_timeout(drive); + if (HWIF(drive)->resetproc != NULL) + HWIF(drive)->resetproc(drive); + return __ide_dma_timeout(drive); } static void pdc202xx_reset_host (ide_hwif_t *hwif) @@ -355,13 +347,12 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif) hwif->err_stops_fifo = 1; hwif->ide_dma_check = &pdc202xx_config_drive_xfer_rate; - hwif->dma_lost_irq = &pdc202xx_dma_lost_irq; - hwif->dma_timeout = &pdc202xx_dma_timeout; + hwif->ide_dma_lostirq = &pdc202xx_ide_dma_lostirq; + hwif->ide_dma_timeout = &pdc202xx_ide_dma_timeout; if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) { - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = pdc202xx_old_cable_detect(hwif); - + if (!(hwif->udma_four)) + hwif->udma_four = (pdc202xx_old_cable_detect(hwif)) ? 0 : 1; hwif->dma_start = &pdc202xx_old_ide_dma_start; hwif->ide_dma_end = &pdc202xx_old_ide_dma_end; } diff --git a/trunk/drivers/ide/pci/piix.c b/trunk/drivers/ide/pci/piix.c index 2e0b29ef596a..8b219dd63024 100644 --- a/trunk/drivers/ide/pci/piix.c +++ b/trunk/drivers/ide/pci/piix.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/piix.c Version 0.50 Jun 10, 2007 + * linux/drivers/ide/pci/piix.c Version 0.47 February 8, 2007 * * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer * Copyright (C) 1998-2000 Andre Hedrick @@ -394,45 +394,14 @@ static void piix_dma_clear_irq(ide_drive_t *drive) hwif->OUTB(dma_stat, hwif->dma_status); } -struct ich_laptop { - u16 device; - u16 subvendor; - u16 subdevice; -}; - -/* - * List of laptops that use short cables rather than 80 wire - */ - -static const struct ich_laptop ich_laptop[] = { - /* devid, subvendor, subdev */ - { 0x27DF, 0x0005, 0x0280 }, /* ICH7 on Acer 5602WLMi */ - { 0x27DF, 0x1025, 0x0110 }, /* ICH7 on Acer 3682WLMi */ - { 0x27DF, 0x1043, 0x1267 }, /* ICH7 on Asus W5F */ - { 0x24CA, 0x1025, 0x0061 }, /* ICH4 on Acer Aspire 2023WLMi */ - /* end marker */ - { 0, } -}; - -static u8 __devinit piix_cable_detect(ide_hwif_t *hwif) +static int __devinit piix_cable_detect(ide_hwif_t *hwif) { - struct pci_dev *pdev = hwif->pci_dev; - const struct ich_laptop *lap = &ich_laptop[0]; + struct pci_dev *dev = hwif->pci_dev; u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30; - /* check for specials */ - while (lap->device) { - if (lap->device == pdev->device && - lap->subvendor == pdev->subsystem_vendor && - lap->subdevice == pdev->subsystem_device) { - return ATA_CBL_PATA40_SHORT; - } - lap++; - } - - pci_read_config_byte(pdev, 0x54, ®54h); + pci_read_config_byte(dev, 0x54, ®54h); - return (reg54h & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; + return (reg54h & mask) ? 
1 : 0; } /** @@ -475,8 +444,8 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif) hwif->swdma_mask = 0x04; if (hwif->ultra_mask & 0x78) { - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = piix_cable_detect(hwif); + if (!hwif->udma_four) + hwif->udma_four = piix_cable_detect(hwif); } if (no_piix_dma) diff --git a/trunk/drivers/ide/pci/scc_pata.c b/trunk/drivers/ide/pci/scc_pata.c index 7b87488e3daa..55bc0a32e34f 100644 --- a/trunk/drivers/ide/pci/scc_pata.c +++ b/trunk/drivers/ide/pci/scc_pata.c @@ -716,7 +716,7 @@ static void __devinit init_hwif_scc(ide_hwif_t *hwif) hwif->atapi_dma = 1; /* we support 80c cable only. */ - hwif->cbl = ATA_CBL_PATA80; + hwif->udma_four = 1; hwif->autodma = 0; if (!noautodma) diff --git a/trunk/drivers/ide/pci/serverworks.c b/trunk/drivers/ide/pci/serverworks.c index 1371b5bf6bf0..d9c4fd1ae996 100644 --- a/trunk/drivers/ide/pci/serverworks.c +++ b/trunk/drivers/ide/pci/serverworks.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/serverworks.c Version 0.20 Jun 3 2007 + * linux/drivers/ide/pci/serverworks.c Version 0.11 Jun 2 2007 * * Copyright (C) 1998-2000 Michel Aubry * Copyright (C) 1998-2000 Andrzej Krzysztofowicz @@ -151,11 +151,84 @@ static int svwks_tune_chipset (ide_drive_t *drive, u8 xferspeed) if(dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4 && drive->media == ide_disk && speed >= XFER_UDMA_0) BUG(); - + + pci_read_config_byte(dev, drive_pci[drive->dn], &pio_timing); + pci_read_config_byte(dev, drive_pci2[drive->dn], &dma_timing); pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing); pci_read_config_word(dev, 0x4A, &csb5_pio); pci_read_config_byte(dev, 0x54, &ultra_enable); + /* If we are in RAID mode (eg AMI MegaIDE) then we can't it + turns out trust the firmware configuration */ + + if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE) + goto oem_setup_failed; + + /* Per Specified Design by OEM, and ASIC Architect */ + if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) || + (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) { + if (!drive->init_speed) { + u8 dma_stat = inb(hwif->dma_status); + + if (((ultra_enable << (7-drive->dn) & 0x80) == 0x80) && + ((dma_stat & (1<<(5+unit))) == (1<<(5+unit)))) { + drive->current_speed = drive->init_speed = XFER_UDMA_0 + udma_modes[(ultra_timing >> (4*unit)) & ~(0xF0)]; + return 0; + } else if ((dma_timing) && + ((dma_stat&(1<<(5+unit)))==(1<<(5+unit)))) { + u8 dmaspeed; + + switch (dma_timing & 0x77) { + case 0x20: + dmaspeed = XFER_MW_DMA_2; + break; + case 0x21: + dmaspeed = XFER_MW_DMA_1; + break; + case 0x77: + dmaspeed = XFER_MW_DMA_0; + break; + default: + goto dma_pio; + } + + drive->current_speed = drive->init_speed = dmaspeed; + return 0; + } +dma_pio: + if (pio_timing) { + u8 piospeed; + + switch (pio_timing & 0x7f) { + case 0x20: + piospeed = XFER_PIO_4; + break; + case 0x22: + piospeed = XFER_PIO_3; + break; + case 0x34: + piospeed = XFER_PIO_2; + break; + case 0x47: + piospeed = XFER_PIO_1; + break; + case 0x5d: + piospeed = XFER_PIO_0; + break; + default: + goto oem_setup_failed; + } + + drive->current_speed = drive->init_speed = piospeed; + return 0; + } + } + } + +oem_setup_failed: + + pio_timing = 0; + dma_timing = 0; ultra_timing &= ~(0x0F << (4*unit)); ultra_enable &= ~(0x01 << drive->dn); csb5_pio &= ~(0x0F << (4*drive->dn)); @@ -329,9 +402,9 @@ static unsigned int __devinit init_chipset_svwks (struct pci_dev *dev, const cha return dev->irq; } -static u8 __devinit ata66_svwks_svwks(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks_svwks (ide_hwif_t *hwif) { - 
return ATA_CBL_PATA80; + return 1; } /* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits @@ -341,7 +414,7 @@ static u8 __devinit ata66_svwks_svwks(ide_hwif_t *hwif) * Bit 14 clear = primary IDE channel does not have 80-pin cable. * Bit 14 set = primary IDE channel has 80-pin cable. */ -static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL && @@ -349,8 +422,8 @@ static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif) (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE || dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE)) return ((1 << (hwif->channel + 14)) & - dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; - return ATA_CBL_PATA40; + dev->subsystem_device) ? 1 : 0; + return 0; } /* Sun Cobalt Alpine hardware avoids the 80-pin cable @@ -359,18 +432,18 @@ static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif) * * WARNING: this only works on Alpine hardware! */ -static u8 __devinit ata66_svwks_cobalt(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks_cobalt (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN && dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE) return ((1 << (hwif->channel + 14)) & - dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; - return ATA_CBL_PATA40; + dev->subsystem_device) ? 1 : 0; + return 0; } -static u8 __devinit ata66_svwks(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; @@ -389,9 +462,9 @@ static u8 __devinit ata66_svwks(ide_hwif_t *hwif) /* Per Specified Design by OEM, and ASIC Architect */ if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) || (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) - return ATA_CBL_PATA80; + return 1; - return ATA_CBL_PATA40; + return 0; } static void __devinit init_hwif_svwks (ide_hwif_t *hwif) @@ -422,8 +495,8 @@ static void __devinit init_hwif_svwks (ide_hwif_t *hwif) hwif->ide_dma_check = &svwks_config_drive_xfer_rate; if (hwif->pci_dev->device != PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) { - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_svwks(hwif); + if (!hwif->udma_four) + hwif->udma_four = ata66_svwks(hwif); } if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/sgiioc4.c b/trunk/drivers/ide/pci/sgiioc4.c index d396b2929ed8..d3185e29a38e 100644 --- a/trunk/drivers/ide/pci/sgiioc4.c +++ b/trunk/drivers/ide/pci/sgiioc4.c @@ -316,19 +316,19 @@ static void sgiioc4_dma_host_off(ide_drive_t * drive) sgiioc4_clearirq(drive); } -static void -sgiioc4_resetproc(ide_drive_t * drive) +static int +sgiioc4_ide_dma_lostirq(ide_drive_t * drive) { - sgiioc4_ide_dma_end(drive); - sgiioc4_clearirq(drive); + HWIF(drive)->resetproc(drive); + + return __ide_dma_lostirq(drive); } static void -sgiioc4_dma_lost_irq(ide_drive_t * drive) +sgiioc4_resetproc(ide_drive_t * drive) { - sgiioc4_resetproc(drive); - - ide_dma_lost_irq(drive); + sgiioc4_ide_dma_end(drive); + sgiioc4_clearirq(drive); } static u8 @@ -607,8 +607,8 @@ ide_init_sgiioc4(ide_hwif_t * hwif) hwif->ide_dma_test_irq = &sgiioc4_ide_dma_test_irq; hwif->dma_host_on = &sgiioc4_dma_host_on; hwif->dma_host_off = &sgiioc4_dma_host_off; - hwif->dma_lost_irq = &sgiioc4_dma_lost_irq; - hwif->dma_timeout = &ide_dma_timeout; + hwif->ide_dma_lostirq = &sgiioc4_ide_dma_lostirq; + hwif->ide_dma_timeout = 
&__ide_dma_timeout; hwif->INB = &sgiioc4_INB; } diff --git a/trunk/drivers/ide/pci/siimage.c b/trunk/drivers/ide/pci/siimage.c index 1c3e35487893..1a4444e7226a 100644 --- a/trunk/drivers/ide/pci/siimage.c +++ b/trunk/drivers/ide/pci/siimage.c @@ -933,17 +933,16 @@ static void __devinit init_iops_siimage(ide_hwif_t *hwif) * interface. */ -static u8 __devinit ata66_siimage(ide_hwif_t *hwif) +static unsigned int __devinit ata66_siimage(ide_hwif_t *hwif) { unsigned long addr = siimage_selreg(hwif, 0); - u8 ata66 = 0; - - if (pci_get_drvdata(hwif->pci_dev) == NULL) + if (pci_get_drvdata(hwif->pci_dev) == NULL) { + u8 ata66 = 0; pci_read_config_byte(hwif->pci_dev, addr, &ata66); - else - ata66 = hwif->INB(addr); + return (ata66 & 0x01) ? 1 : 0; + } - return (ata66 & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; + return (hwif->INB(addr) & 0x01) ? 1 : 0; } /** @@ -989,9 +988,8 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif) hwif->atapi_dma = 1; hwif->ide_dma_check = &siimage_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_siimage(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_siimage(hwif); if (hwif->mmio) { hwif->ide_dma_test_irq = &siimage_mmio_ide_dma_test_irq; diff --git a/trunk/drivers/ide/pci/sis5513.c b/trunk/drivers/ide/pci/sis5513.c index f875183ac8d9..ec0adad9ef61 100644 --- a/trunk/drivers/ide/pci/sis5513.c +++ b/trunk/drivers/ide/pci/sis5513.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/sis5513.c Version 0.25 Jun 10, 2007 + * linux/drivers/ide/pci/sis5513.c Version 0.20 Mar 4, 2007 * * Copyright (C) 1999-2000 Andre Hedrick * Copyright (C) 2002 Lionel Bouton , Maintainer @@ -796,33 +796,10 @@ static unsigned int __devinit init_chipset_sis5513 (struct pci_dev *dev, const c return 0; } -struct sis_laptop { - u16 device; - u16 subvendor; - u16 subdevice; -}; - -static const struct sis_laptop sis_laptop[] = { - /* devid, subvendor, subdev */ - { 0x5513, 0x1043, 0x1107 }, /* ASUS A6K */ - /* end marker */ - { 0, } -}; - -static u8 __devinit ata66_sis5513(ide_hwif_t *hwif) +static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif) { - struct pci_dev *pdev = hwif->pci_dev; - const struct sis_laptop *lap = &sis_laptop[0]; u8 ata66 = 0; - while (lap->device) { - if (lap->device == pdev->device && - lap->subvendor == pdev->subsystem_vendor && - lap->subdevice == pdev->subsystem_device) - return ATA_CBL_PATA40_SHORT; - lap++; - } - if (chipset_family >= ATA_133) { u16 regw = 0; u16 reg_addr = hwif->channel ? 0x52: 0x50; @@ -834,8 +811,7 @@ static u8 __devinit ata66_sis5513(ide_hwif_t *hwif) pci_read_config_byte(hwif->pci_dev, 0x48, ®48h); ata66 = (reg48h & mask) ? 0 : 1; } - - return ata66 ? ATA_CBL_PATA80 : ATA_CBL_PATA40; + return ata66; } static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif) @@ -865,8 +841,8 @@ static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif) if (!chipset_family) return; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_sis5513(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_sis5513(hwif); if (chipset_family > ATA_16) { hwif->ide_dma_check = &sis5513_config_xfer_rate; diff --git a/trunk/drivers/ide/pci/sl82c105.c b/trunk/drivers/ide/pci/sl82c105.c index 487879842af4..7c383d9cc472 100644 --- a/trunk/drivers/ide/pci/sl82c105.c +++ b/trunk/drivers/ide/pci/sl82c105.c @@ -195,7 +195,7 @@ static inline void sl82c105_reset_host(struct pci_dev *dev) * This function is called when the IDE timer expires, the drive * indicates that it is READY, and we were waiting for DMA to complete. 
*/ -static void sl82c105_dma_lost_irq(ide_drive_t *drive) +static int sl82c105_ide_dma_lostirq(ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); struct pci_dev *dev = hwif->pci_dev; @@ -222,6 +222,9 @@ static void sl82c105_dma_lost_irq(ide_drive_t *drive) } sl82c105_reset_host(dev); + + /* __ide_dma_lostirq would return 1, so we do as well */ + return 1; } /* @@ -241,12 +244,15 @@ static void sl82c105_dma_start(ide_drive_t *drive) ide_dma_start(drive); } -static void sl82c105_dma_timeout(ide_drive_t *drive) +static int sl82c105_ide_dma_timeout(ide_drive_t *drive) { - DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name)); + ide_hwif_t *hwif = HWIF(drive); + struct pci_dev *dev = hwif->pci_dev; - sl82c105_reset_host(HWIF(drive)->pci_dev); - ide_dma_timeout(drive); + DBG(("sl82c105_ide_dma_timeout(drive:%s)\n", drive->name)); + + sl82c105_reset_host(dev); + return __ide_dma_timeout(drive); } static int sl82c105_ide_dma_on(ide_drive_t *drive) @@ -435,9 +441,9 @@ static void __devinit init_hwif_sl82c105(ide_hwif_t *hwif) hwif->ide_dma_check = &sl82c105_ide_dma_check; hwif->ide_dma_on = &sl82c105_ide_dma_on; hwif->dma_off_quietly = &sl82c105_dma_off_quietly; - hwif->dma_lost_irq = &sl82c105_dma_lost_irq; + hwif->ide_dma_lostirq = &sl82c105_ide_dma_lostirq; hwif->dma_start = &sl82c105_dma_start; - hwif->dma_timeout = &sl82c105_dma_timeout; + hwif->ide_dma_timeout = &sl82c105_ide_dma_timeout; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/slc90e66.c b/trunk/drivers/ide/pci/slc90e66.c index 575dbbd8b482..c40f291f91e0 100644 --- a/trunk/drivers/ide/pci/slc90e66.c +++ b/trunk/drivers/ide/pci/slc90e66.c @@ -199,9 +199,10 @@ static void __devinit init_hwif_slc90e66 (ide_hwif_t *hwif) hwif->mwdma_mask = 0x06; hwif->swdma_mask = 0x04; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) + if (!hwif->udma_four) { /* bit[0(1)]: 0:80, 1:40 */ - hwif->cbl = (reg47 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + hwif->udma_four = (reg47 & mask) ? 0 : 1; + } hwif->ide_dma_check = &slc90e66_config_drive_xfer_rate; diff --git a/trunk/drivers/ide/pci/tc86c001.c b/trunk/drivers/ide/pci/tc86c001.c index 8de1f8e22494..cee619bb2eaf 100644 --- a/trunk/drivers/ide/pci/tc86c001.c +++ b/trunk/drivers/ide/pci/tc86c001.c @@ -220,13 +220,13 @@ static void __devinit init_hwif_tc86c001(ide_hwif_t *hwif) hwif->ide_dma_check = &tc86c001_config_drive_xfer_rate; hwif->dma_start = &tc86c001_dma_start; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) { + if (!hwif->udma_four) { /* * System Control 1 Register bit 13 (PDIAGN): * 0=80-pin cable, 1=40-pin cable */ scr1 = hwif->INW(sc_base + 0x00); - hwif->cbl = (scr1 & 0x2000) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + hwif->udma_four = (scr1 & 0x2000) ? 0 : 1; } if (!noautodma) diff --git a/trunk/drivers/ide/pci/via82cxxx.c b/trunk/drivers/ide/pci/via82cxxx.c index d21dd2e7eeb3..a508550c4095 100644 --- a/trunk/drivers/ide/pci/via82cxxx.c +++ b/trunk/drivers/ide/pci/via82cxxx.c @@ -1,6 +1,6 @@ /* * - * Version 3.45 + * Version 3.38 * * VIA IDE driver for Linux. 
Supported southbridges: * @@ -9,7 +9,6 @@ * vt8235, vt8237, vt8237a * * Copyright (c) 2000-2002 Vojtech Pavlik - * Copyright (c) 2007 Bartlomiej Zolnierkiewicz * * Based on the work of: * Michel Aubry @@ -34,8 +33,6 @@ #include #include #include -#include - #include #ifdef CONFIG_PPC_CHRP @@ -44,6 +41,8 @@ #include "ide-timing.h" +#define DISPLAY_VIA_TIMINGS + #define VIA_IDE_ENABLE 0x40 #define VIA_IDE_CONFIG 0x41 #define VIA_FIFO_CONFIG 0x43 @@ -55,12 +54,18 @@ #define VIA_ADDRESS_SETUP 0x4c #define VIA_UDMA_TIMING 0x50 -#define VIA_BAD_PREQ 0x01 /* Crashes if PREQ# till DDACK# set */ -#define VIA_BAD_CLK66 0x02 /* 66 MHz clock doesn't work correctly */ -#define VIA_SET_FIFO 0x04 /* Needs to have FIFO split set */ -#define VIA_NO_UNMASK 0x08 /* Doesn't work with IRQ unmasking on */ -#define VIA_BAD_ID 0x10 /* Has wrong vendor ID (0x1107) */ -#define VIA_BAD_AST 0x20 /* Don't touch Address Setup Timing */ +#define VIA_UDMA 0x007 +#define VIA_UDMA_NONE 0x000 +#define VIA_UDMA_33 0x001 +#define VIA_UDMA_66 0x002 +#define VIA_UDMA_100 0x003 +#define VIA_UDMA_133 0x004 +#define VIA_BAD_PREQ 0x010 /* Crashes if PREQ# till DDACK# set */ +#define VIA_BAD_CLK66 0x020 /* 66 MHz clock doesn't work correctly */ +#define VIA_SET_FIFO 0x040 /* Needs to have FIFO split set */ +#define VIA_NO_UNMASK 0x080 /* Doesn't work with IRQ unmasking on */ +#define VIA_BAD_ID 0x100 /* Has wrong vendor ID (0x1107) */ +#define VIA_BAD_AST 0x200 /* Don't touch Address Setup Timing */ /* * VIA SouthBridge chips. @@ -71,37 +76,36 @@ static struct via_isa_bridge { u16 id; u8 rev_min; u8 rev_max; - u8 udma_mask; - u8 flags; + u16 flags; } via_isa_bridges[] = { - { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, ATA_UDMA5, }, - { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, ATA_UDMA5, }, - { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, ATA_UDMA5, }, - { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, ATA_UDMA5, }, - { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, ATA_UDMA4, }, - { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 }, - { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, ATA_UDMA4, }, - { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 }, - { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, ATA_UDMA2, VIA_SET_FIFO }, - { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, ATA_UDMA2, VIA_SET_FIFO | VIA_BAD_PREQ }, - { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, ATA_UDMA2, VIA_SET_FIFO }, - { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, ATA_UDMA2, VIA_SET_FIFO }, - { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, 0x00, VIA_SET_FIFO }, - { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK }, - { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, + { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { 
"vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, VIA_UDMA_100 }, + { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, VIA_UDMA_100 }, + { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, VIA_UDMA_100 }, + { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, VIA_UDMA_100 }, + { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, VIA_UDMA_66 }, + { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, + { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, VIA_UDMA_66 }, + { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, + { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, VIA_UDMA_33 | VIA_SET_FIFO }, + { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, VIA_UDMA_33 | VIA_SET_FIFO | VIA_BAD_PREQ }, + { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, VIA_UDMA_33 | VIA_SET_FIFO }, + { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, VIA_UDMA_33 | VIA_SET_FIFO }, + { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, VIA_UDMA_NONE | VIA_SET_FIFO }, + { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK }, + { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, { NULL } }; static unsigned int via_clock; -static char *via_dma[] = { "16", "25", "33", "44", "66", "100", "133" }; +static char *via_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; struct via82cxxx_dev { @@ -136,12 +140,12 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing) pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn), ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); - switch (vdev->via_config->udma_mask) { - case ATA_UDMA2: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; - case ATA_UDMA4: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break; - case ATA_UDMA5: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; - case ATA_UDMA6: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; - default: return; + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_33: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; + case VIA_UDMA_66: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break; + case VIA_UDMA_100: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; + case VIA_UDMA_133: t = timing->udma ? 
(0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; + default: return; } pci_write_config_byte(dev, VIA_UDMA_TIMING + (3 - dn), t); @@ -169,12 +173,12 @@ static int via_set_drive(ide_drive_t *drive, u8 speed) T = 1000000000 / via_clock; - switch (vdev->via_config->udma_mask) { - case ATA_UDMA2: UT = T; break; - case ATA_UDMA4: UT = T/2; break; - case ATA_UDMA5: UT = T/3; break; - case ATA_UDMA6: UT = T/4; break; - default: UT = T; + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_33: UT = T; break; + case VIA_UDMA_66: UT = T/2; break; + case VIA_UDMA_100: UT = T/3; break; + case VIA_UDMA_133: UT = T/4; break; + default: UT = T; } ide_timing_compute(drive, speed, &t, T, UT); @@ -204,7 +208,8 @@ static int via_set_drive(ide_drive_t *drive, u8 speed) static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) { if (pio == 255) { - via_set_drive(drive, ide_find_best_pio_mode(drive)); + via_set_drive(drive, + ide_find_best_mode(drive, XFER_PIO | XFER_EPIO)); return; } @@ -221,10 +226,16 @@ static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) static int via82cxxx_ide_dma_check (ide_drive_t *drive) { - u8 speed = ide_max_dma_mode(drive); + ide_hwif_t *hwif = HWIF(drive); + struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); + u16 w80 = hwif->udma_four; - if (speed == 0) - speed = ide_find_best_pio_mode(drive); + u16 speed = ide_find_best_mode(drive, + XFER_PIO | XFER_EPIO | XFER_SWDMA | XFER_MWDMA | + (vdev->via_config->flags & VIA_UDMA ? XFER_UDMA : 0) | + (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_66 ? XFER_UDMA_66 : 0) | + (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_100 ? XFER_UDMA_100 : 0) | + (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_133 ? XFER_UDMA_133 : 0)); via_set_drive(drive, speed); @@ -261,8 +272,8 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) { int i; - switch (vdev->via_config->udma_mask) { - case ATA_UDMA4: + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_66: for (i = 24; i >= 0; i -= 8) if (((u >> (i & 16)) & 8) && ((u >> i) & 0x20) && @@ -275,7 +286,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) } break; - case ATA_UDMA5: + case VIA_UDMA_100: for (i = 24; i >= 0; i -= 8) if (((u >> i) & 0x10) || (((u >> i) & 0x20) && @@ -287,7 +298,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) } break; - case ATA_UDMA6: + case VIA_UDMA_133: for (i = 24; i >= 0; i -= 8) if (((u >> i) & 0x10) || (((u >> i) & 0x20) && @@ -342,7 +353,7 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const via_cable_detect(vdev, u); - if (via_config->udma_mask == ATA_UDMA4) { + if ((via_config->flags & VIA_UDMA) == VIA_UDMA_66) { /* Enable Clk66 */ pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008); } else if (via_config->flags & VIA_BAD_CLK66) { @@ -405,54 +416,16 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const */ pci_read_config_byte(isa, PCI_REVISION_ID, &t); - printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %sDMA%s " + printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %s " "controller on pci%s\n", via_config->name, t, - via_config->udma_mask ? "U" : "MW", - via_dma[via_config->udma_mask ? 
- (fls(via_config->udma_mask) - 1) : 0], + via_dma[via_config->flags & VIA_UDMA], pci_name(dev)); pci_dev_put(isa); return 0; } -/* - * Cable special cases - */ - -static struct dmi_system_id cable_dmi_table[] = { - { - .ident = "Acer Ferrari 3400", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Acer,Inc."), - DMI_MATCH(DMI_BOARD_NAME, "Ferrari 3400"), - }, - }, - { } -}; - -static int via_cable_override(void) -{ - /* Systems by DMI */ - if (dmi_check_system(cable_dmi_table)) - return 1; - return 0; -} - -static u8 __devinit via82cxxx_cable_detect(ide_hwif_t *hwif) -{ - struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); - - if (via_cable_override()) - return ATA_CBL_PATA40_SHORT; - - if ((vdev->via_80w >> hwif->channel) & 1) - return ATA_CBL_PATA80; - else - return ATA_CBL_PATA40; -} - static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) { struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); @@ -481,14 +454,12 @@ static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) return; hwif->atapi_dma = 1; - - hwif->ultra_mask = vdev->via_config->udma_mask; + hwif->ultra_mask = 0x7f; hwif->mwdma_mask = 0x07; hwif->swdma_mask = 0x07; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = via82cxxx_cable_detect(hwif); - + if (!hwif->udma_four) + hwif->udma_four = (vdev->via_80w >> hwif->channel) & 1; hwif->ide_dma_check = &via82cxxx_ide_dma_check; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/ppc/pmac.c b/trunk/drivers/ide/ppc/pmac.c index e46f47206542..45fc36f0f219 100644 --- a/trunk/drivers/ide/ppc/pmac.c +++ b/trunk/drivers/ide/ppc/pmac.c @@ -942,8 +942,8 @@ pmac_ide_tune_chipset (ide_drive_t *drive, byte speed) return 1; case XFER_UDMA_4: case XFER_UDMA_3: - if (drive->hwif->cbl != ATA_CBL_PATA80) - return 1; + if (HWIF(drive)->udma_four == 0) + return 1; case XFER_UDMA_2: case XFER_UDMA_1: case XFER_UDMA_0: @@ -1244,7 +1244,7 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif) hwif->chipset = ide_pmac; hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET] || pmif->mediabay; hwif->hold = pmif->mediabay; - hwif->cbl = pmif->cable_80 ? 
ATA_CBL_PATA80 : ATA_CBL_PATA40; + hwif->udma_four = pmif->cable_80; hwif->drives[0].unmask = 1; hwif->drives[1].unmask = 1; hwif->tuneproc = pmac_ide_tuneproc; @@ -1821,11 +1821,28 @@ pmac_ide_dma_check(ide_drive_t *drive) enable = 0; if (enable) { - u8 mode = ide_max_dma_mode(drive); - - if (mode >= XFER_UDMA_0) + short mode; + + map = XFER_MWDMA; + if (pmif->kind == controller_kl_ata4 + || pmif->kind == controller_un_ata6 + || pmif->kind == controller_k2_ata6 + || pmif->kind == controller_sh_ata6) { + map |= XFER_UDMA; + if (pmif->cable_80) { + map |= XFER_UDMA_66; + if (pmif->kind == controller_un_ata6 || + pmif->kind == controller_k2_ata6 || + pmif->kind == controller_sh_ata6) + map |= XFER_UDMA_100; + if (pmif->kind == controller_sh_ata6) + map |= XFER_UDMA_133; + } + } + mode = ide_find_best_mode(drive, map); + if (mode & XFER_UDMA) drive->using_dma = pmac_ide_udma_enable(drive, mode); - else if (mode >= XFER_MW_DMA_0) + else if (mode & XFER_MWDMA) drive->using_dma = pmac_ide_mdma_enable(drive, mode); hwif->OUTB(0, IDE_CONTROL_REG); /* Apply settings to controller */ @@ -1987,19 +2004,20 @@ static void pmac_ide_dma_host_on(ide_drive_t *drive) { } -static void -pmac_ide_dma_lost_irq (ide_drive_t *drive) +static int +pmac_ide_dma_lostirq (ide_drive_t *drive) { pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data; volatile struct dbdma_regs __iomem *dma; unsigned long status; if (pmif == NULL) - return; + return 0; dma = pmif->dma_regs; status = readl(&dma->status); printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status); + return 0; } /* @@ -2039,8 +2057,8 @@ pmac_ide_setup_dma(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif) hwif->ide_dma_test_irq = &pmac_ide_dma_test_irq; hwif->dma_host_off = &pmac_ide_dma_host_off; hwif->dma_host_on = &pmac_ide_dma_host_on; - hwif->dma_timeout = &ide_dma_timeout; - hwif->dma_lost_irq = &pmac_ide_dma_lost_irq; + hwif->ide_dma_timeout = &__ide_dma_timeout; + hwif->ide_dma_lostirq = &pmac_ide_dma_lostirq; hwif->atapi_dma = 1; switch(pmif->kind) { diff --git a/trunk/drivers/misc/Kconfig b/trunk/drivers/misc/Kconfig index 616eee9c04f1..bd601efa7bd1 100644 --- a/trunk/drivers/misc/Kconfig +++ b/trunk/drivers/misc/Kconfig @@ -34,6 +34,11 @@ config PHANTOM If you choose to build module, its name will be phantom. If unsure, say N here. +config EEPROM_93CX6 + tristate "EEPROM 93CX6 support" + ---help--- + This is a driver for the EEPROM chipsets 93c46 and 93c66. + The driver supports both read as well as write commands. If unsure, say N. @@ -187,5 +192,4 @@ config THINKPAD_ACPI_BAY If you are not sure, say Y here. - endmenu diff --git a/trunk/drivers/misc/Makefile b/trunk/drivers/misc/Makefile index 8abbf2f07a65..b5ce0e3dba86 100644 --- a/trunk/drivers/misc/Makefile +++ b/trunk/drivers/misc/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SGI_IOC4) += ioc4.o obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o +obj-$(CONFIG_EEPROM_93CX6) += eeprom_93cx6.o diff --git a/trunk/drivers/misc/eeprom_93cx6.c b/trunk/drivers/misc/eeprom_93cx6.c new file mode 100644 index 000000000000..bfcb43424dcd --- /dev/null +++ b/trunk/drivers/misc/eeprom_93cx6.c @@ -0,0 +1,229 @@ +/* + Copyright (C) 2004 - 2006 rt2x00 SourceForge Project + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the + Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + Module: eeprom_93cx6 + Abstract: EEPROM reader routines for 93cx6 chipsets. + Supported chipsets: 93c46 & 93c66. + */ + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("http://rt2x00.serialmonkey.com"); +MODULE_VERSION("1.0"); +MODULE_DESCRIPTION("EEPROM 93cx6 chip driver"); +MODULE_LICENSE("GPL"); + +static inline void eeprom_93cx6_pulse_high(struct eeprom_93cx6 *eeprom) +{ + eeprom->reg_data_clock = 1; + eeprom->register_write(eeprom); + udelay(1); +} + +static inline void eeprom_93cx6_pulse_low(struct eeprom_93cx6 *eeprom) +{ + eeprom->reg_data_clock = 0; + eeprom->register_write(eeprom); + udelay(1); +} + +static void eeprom_93cx6_startup(struct eeprom_93cx6 *eeprom) +{ + /* + * Clear all flags, and enable chip select. + */ + eeprom->register_read(eeprom); + eeprom->reg_data_in = 0; + eeprom->reg_data_out = 0; + eeprom->reg_data_clock = 0; + eeprom->reg_chip_select = 1; + eeprom->register_write(eeprom); + + /* + * kick a pulse. + */ + eeprom_93cx6_pulse_high(eeprom); + eeprom_93cx6_pulse_low(eeprom); +} + +static void eeprom_93cx6_cleanup(struct eeprom_93cx6 *eeprom) +{ + /* + * Clear chip_select and data_in flags. + */ + eeprom->register_read(eeprom); + eeprom->reg_data_in = 0; + eeprom->reg_chip_select = 0; + eeprom->register_write(eeprom); + + /* + * kick a pulse. + */ + eeprom_93cx6_pulse_high(eeprom); + eeprom_93cx6_pulse_low(eeprom); +} + +static void eeprom_93cx6_write_bits(struct eeprom_93cx6 *eeprom, + const u16 data, const u16 count) +{ + unsigned int i; + + eeprom->register_read(eeprom); + + /* + * Clear data flags. + */ + eeprom->reg_data_in = 0; + eeprom->reg_data_out = 0; + + /* + * Start writing all bits. + */ + for (i = count; i > 0; i--) { + /* + * Check if this bit needs to be set. + */ + eeprom->reg_data_in = !!(data & (1 << (i - 1))); + + /* + * Write the bit to the eeprom register. + */ + eeprom->register_write(eeprom); + + /* + * Kick a pulse. + */ + eeprom_93cx6_pulse_high(eeprom); + eeprom_93cx6_pulse_low(eeprom); + } + + eeprom->reg_data_in = 0; + eeprom->register_write(eeprom); +} + +static void eeprom_93cx6_read_bits(struct eeprom_93cx6 *eeprom, + u16 *data, const u16 count) +{ + unsigned int i; + u16 buf = 0; + + eeprom->register_read(eeprom); + + /* + * Clear data flags. + */ + eeprom->reg_data_in = 0; + eeprom->reg_data_out = 0; + + /* + * Start reading all bits. + */ + for (i = count; i > 0; i--) { + eeprom_93cx6_pulse_high(eeprom); + + eeprom->register_read(eeprom); + + /* + * Clear data_in flag. + */ + eeprom->reg_data_in = 0; + + /* + * Read if the bit has been set. + */ + if (eeprom->reg_data_out) + buf |= (1 << (i - 1)); + + eeprom_93cx6_pulse_low(eeprom); + } + + *data = buf; +} + +/** + * eeprom_93cx6_read - Read multiple words from eeprom + * @eeprom: Pointer to eeprom structure + * @word: Word index from where we should start reading + * @data: target pointer where the information will have to be stored + * + * This function will read the eeprom data as host-endian word + * into the given data pointer. 
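As a usage illustration for the 93cx6 read helpers added in this file: the core never touches hardware itself, it only toggles the reg_* flag fields and calls back into its owner through register_read()/register_write(), so a consumer just maps those flags onto its own EEPROM control register and then calls eeprom_93cx6_read()/eeprom_93cx6_multiread(). The sketch below is hypothetical: the foo_dev device, its csr register and the FOO_EEPROM_* bit masks are made up for illustration, the data pointer used to carry the owner is assumed to be provided by the accompanying eeprom_93cx6.h header, and a width of 6 address bits assumes a 93c46 part (8 would be used for a 93c66).

#include <linux/types.h>
#include <linux/io.h>
#include <linux/eeprom_93cx6.h>

/* Hypothetical consumer device; only its EEPROM control register matters. */
struct foo_dev {
	void __iomem *csr;		/* assumed EEPROM control/status register */
};

/* Assumed bit layout of the hypothetical control register. */
#define FOO_EEPROM_DATA_IN	0x01
#define FOO_EEPROM_DATA_OUT	0x02
#define FOO_EEPROM_DATA_CLOCK	0x04
#define FOO_EEPROM_CHIP_SELECT	0x08

static void foo_eeprom_register_read(struct eeprom_93cx6 *eeprom)
{
	struct foo_dev *dev = eeprom->data;	/* owner pointer (assumed field) */
	u32 reg = readl(dev->csr);

	/* Mirror the hardware pins into the library's flag fields. */
	eeprom->reg_data_in = !!(reg & FOO_EEPROM_DATA_IN);
	eeprom->reg_data_out = !!(reg & FOO_EEPROM_DATA_OUT);
	eeprom->reg_data_clock = !!(reg & FOO_EEPROM_DATA_CLOCK);
	eeprom->reg_chip_select = !!(reg & FOO_EEPROM_CHIP_SELECT);
}

static void foo_eeprom_register_write(struct eeprom_93cx6 *eeprom)
{
	struct foo_dev *dev = eeprom->data;
	u32 reg = 0;

	/* Drive the pins from the flag fields set by the library. */
	if (eeprom->reg_data_in)
		reg |= FOO_EEPROM_DATA_IN;
	if (eeprom->reg_data_out)
		reg |= FOO_EEPROM_DATA_OUT;
	if (eeprom->reg_data_clock)
		reg |= FOO_EEPROM_DATA_CLOCK;
	if (eeprom->reg_chip_select)
		reg |= FOO_EEPROM_CHIP_SELECT;

	writel(reg, dev->csr);
}

static void foo_read_eeprom(struct foo_dev *dev, __le16 *buf, u16 words)
{
	struct eeprom_93cx6 eeprom;

	eeprom.data = dev;
	eeprom.register_read = foo_eeprom_register_read;
	eeprom.register_write = foo_eeprom_register_write;
	eeprom.width = 6;	/* address bits: 6 for a 93c46, 8 for a 93c66 */

	/* Words are returned little-endian, as documented for multiread. */
	eeprom_93cx6_multiread(&eeprom, 0, buf, words);
}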
+ */ +void eeprom_93cx6_read(struct eeprom_93cx6 *eeprom, const u8 word, + u16 *data) +{ + u16 command; + + /* + * Initialize the eeprom register + */ + eeprom_93cx6_startup(eeprom); + + /* + * Select the read opcode and the word to be read. + */ + command = (PCI_EEPROM_READ_OPCODE << eeprom->width) | word; + eeprom_93cx6_write_bits(eeprom, command, + PCI_EEPROM_WIDTH_OPCODE + eeprom->width); + + /* + * Read the requested 16 bits. + */ + eeprom_93cx6_read_bits(eeprom, data, 16); + + /* + * Cleanup eeprom register. + */ + eeprom_93cx6_cleanup(eeprom); +} +EXPORT_SYMBOL_GPL(eeprom_93cx6_read); + +/** + * eeprom_93cx6_multiread - Read multiple words from eeprom + * @eeprom: Pointer to eeprom structure + * @word: Word index from where we should start reading + * @data: target pointer where the information will have to be stored + * @words: Number of words that should be read. + * + * This function will read all requested words from the eeprom, + * this is done by calling eeprom_93cx6_read() multiple times. + * But with the additional change that while the eeprom_93cx6_read + * will return host ordered bytes, this method will return little + * endian words. + */ +void eeprom_93cx6_multiread(struct eeprom_93cx6 *eeprom, const u8 word, + __le16 *data, const u16 words) +{ + unsigned int i; + u16 tmp; + + for (i = 0; i < words; i++) { + tmp = 0; + eeprom_93cx6_read(eeprom, word + i, &tmp); + data[i] = cpu_to_le16(tmp); + } +} +EXPORT_SYMBOL_GPL(eeprom_93cx6_multiread); + diff --git a/trunk/fs/jfs/endian24.h b/trunk/fs/jfs/endian24.h index fa92f7f1d0d0..79494c4f2b10 100644 --- a/trunk/fs/jfs/endian24.h +++ b/trunk/fs/jfs/endian24.h @@ -29,7 +29,7 @@ __u32 __x = (x); \ ((__u32)( \ ((__x & (__u32)0x000000ffUL) << 16) | \ - (__x & (__u32)0x0000ff00UL) | \ + (__x & (__u32)0x0000ff00UL) | \ ((__x & (__u32)0x00ff0000UL) >> 16) )); \ }) diff --git a/trunk/fs/jfs/jfs_debug.c b/trunk/fs/jfs/jfs_debug.c index 887f5759e536..9c5d59632aac 100644 --- a/trunk/fs/jfs/jfs_debug.c +++ b/trunk/fs/jfs/jfs_debug.c @@ -26,6 +26,34 @@ #include "jfs_filsys.h" #include "jfs_debug.h" +#ifdef CONFIG_JFS_DEBUG +void dump_mem(char *label, void *data, int length) +{ + int i, j; + int *intptr = data; + char *charptr = data; + char buf[10], line[80]; + + printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length, + data); + for (i = 0; i < length; i += 16) { + line[0] = 0; + for (j = 0; (j < 4) && (i + j * 4 < length); j++) { + sprintf(buf, " %08x", intptr[i / 4 + j]); + strcat(line, buf); + } + buf[0] = ' '; + buf[2] = 0; + for (j = 0; (j < 16) && (i + j < length); j++) { + buf[1] = + isprint(charptr[i + j]) ? charptr[i + j] : '.'; + strcat(line, buf); + } + printk("%s\n", line); + } +} +#endif + #ifdef PROC_FS_JFS /* see jfs_debug.h */ static struct proc_dir_entry *base; diff --git a/trunk/fs/jfs/jfs_debug.h b/trunk/fs/jfs/jfs_debug.h index 044c1e654cc0..7378798f0b21 100644 --- a/trunk/fs/jfs/jfs_debug.h +++ b/trunk/fs/jfs/jfs_debug.h @@ -62,6 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; +extern void dump_mem(char *label, void *data, int length); extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); /* information message: e.g., configuration, major event */ @@ -93,6 +94,7 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); * --------- */ #else /* CONFIG_JFS_DEBUG */ +#define dump_mem(label,data,length) do {} while (0) #define ASSERT(p) do {} while (0) #define jfs_info(fmt, arg...) do {} while (0) #define jfs_debug(fmt, arg...) 
do {} while (0) diff --git a/trunk/fs/jfs/jfs_dinode.h b/trunk/fs/jfs/jfs_dinode.h index c387540d3425..40b20111383c 100644 --- a/trunk/fs/jfs/jfs_dinode.h +++ b/trunk/fs/jfs/jfs_dinode.h @@ -19,23 +19,23 @@ #define _H_JFS_DINODE /* - * jfs_dinode.h: on-disk inode manager + * jfs_dinode.h: on-disk inode manager */ -#define INODESLOTSIZE 128 -#define L2INODESLOTSIZE 7 -#define log2INODESIZE 9 /* log2(bytes per dinode) */ +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ /* - * on-disk inode : 512 bytes + * on-disk inode : 512 bytes * * note: align 64-bit fields on 8-byte boundary. */ struct dinode { /* - * I. base area (128 bytes) - * ------------------------ + * I. base area (128 bytes) + * ------------------------ * * define generic/POSIX attributes */ @@ -70,16 +70,16 @@ struct dinode { __le32 di_acltype; /* 4: Type of ACL */ /* - * Extension Areas. + * Extension Areas. * - * Historically, the inode was partitioned into 4 128-byte areas, - * the last 3 being defined as unions which could have multiple - * uses. The first 96 bytes had been completely unused until - * an index table was added to the directory. It is now more - * useful to describe the last 3/4 of the inode as a single - * union. We would probably be better off redesigning the - * entire structure from scratch, but we don't want to break - * commonality with OS/2's JFS at this time. + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. */ union { struct { @@ -95,7 +95,7 @@ struct dinode { } _dir; /* (384) */ #define di_dirtable u._dir._table #define di_dtroot u._dir._dtroot -#define di_parent di_dtroot.header.idotdot +#define di_parent di_dtroot.header.idotdot #define di_DASD di_dtroot.header.DASD struct { @@ -127,14 +127,14 @@ struct dinode { #define di_inlinedata u._file._u2._special._u #define di_rdev u._file._u2._special._u._rdev #define di_fastsymlink u._file._u2._special._u._fastsymlink -#define di_inlineea u._file._u2._special._inlineea +#define di_inlineea u._file._u2._special._inlineea } u; }; /* extended mode bits (on-disk inode di_mode) */ -#define IFJOURNAL 0x00010000 /* journalled file */ -#define ISPARSE 0x00020000 /* sparse file enabled */ -#define INLINEEA 0x00040000 /* inline EA area free */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ #define ISWAPFILE 0x00800000 /* file open for pager swap space */ /* more extended mode bits: attributes for OS/2 */ diff --git a/trunk/fs/jfs/jfs_dmap.c b/trunk/fs/jfs/jfs_dmap.c index e1985066b1c6..f3b1ebb22280 100644 --- a/trunk/fs/jfs/jfs_dmap.c +++ b/trunk/fs/jfs/jfs_dmap.c @@ -154,12 +154,12 @@ static const s8 budtab[256] = { * the in-core descriptor is initialized from disk. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. 
* * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient memory - * -EIO - i/o error + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error */ int dbMount(struct inode *ipbmap) { @@ -232,11 +232,11 @@ int dbMount(struct inode *ipbmap) * the memory for this descriptor is freed. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbUnmount(struct inode *ipbmap, int mounterror) { @@ -320,13 +320,13 @@ int dbSync(struct inode *ipbmap) * at a time. * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - starting block number to be freed. - * nblocks - number of blocks to be freed. + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) { @@ -395,23 +395,23 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) /* * NAME: dbUpdatePMap() * - * FUNCTION: update the allocation state (free or allocate) of the + * FUNCTION: update the allocation state (free or allocate) of the * specified block range in the persistent block allocation map. * * the blocks will be updated in the persistent map one * dmap at a time. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. - * free - 'true' if block range is to be freed from the persistent - * map; 'false' if it is to be allocated. - * blkno - starting block number of the range. - * nblocks - number of contiguous blocks in the range. - * tblk - transaction block; + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbUpdatePMap(struct inode *ipbmap, @@ -573,7 +573,7 @@ dbUpdatePMap(struct inode *ipbmap, /* * NAME: dbNextAG() * - * FUNCTION: find the preferred allocation group for new allocations. + * FUNCTION: find the preferred allocation group for new allocations. * * Within the allocation groups, we maintain a preferred * allocation group which consists of a group with at least @@ -589,10 +589,10 @@ dbUpdatePMap(struct inode *ipbmap, * empty ags around for large allocations. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * the preferred allocation group number. + * the preferred allocation group number. */ int dbNextAG(struct inode *ipbmap) { @@ -656,7 +656,7 @@ int dbNextAG(struct inode *ipbmap) /* * NAME: dbAlloc() * - * FUNCTION: attempt to allocate a specified number of contiguous free + * FUNCTION: attempt to allocate a specified number of contiguous free * blocks from the working allocation block map. * * the block allocation policy uses hints and a multi-step @@ -680,16 +680,16 @@ int dbNextAG(struct inode *ipbmap) * size or requests that specify no hint value. * * PARAMETERS: - * ip - pointer to in-core inode; - * hint - allocation hint. - * nblocks - number of contiguous blocks in the range. 
- * results - on successful return, set to the starting block number + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number * of the newly allocated contiguous range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) { @@ -706,6 +706,12 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* assert that nblocks is valid */ assert(nblocks > 0); +#ifdef _STILL_TO_PORT + /* DASD limit check F226941 */ + if (OVER_LIMIT(ip, nblocks)) + return -ENOSPC; +#endif /* _STILL_TO_PORT */ + /* get the log2 number of blocks to be allocated. * if the number of blocks is not a log2 multiple, * it will be rounded up to the next log2 multiple. @@ -714,6 +720,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) bmp = JFS_SBI(ip->i_sb)->bmap; +//retry: /* serialize w.r.t.extendfs() */ mapSize = bmp->db_mapsize; /* the hint should be within the map */ @@ -872,17 +879,17 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* * NAME: dbAllocExact() * - * FUNCTION: try to allocate the requested extent; + * FUNCTION: try to allocate the requested extent; * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - extent address; - * nblocks - extent length; + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) { @@ -939,7 +946,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) /* * NAME: dbReAlloc() * - * FUNCTION: attempt to extend a current allocation by a specified + * FUNCTION: attempt to extend a current allocation by a specified * number of blocks. * * this routine attempts to satisfy the allocation request @@ -952,21 +959,21 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) * number of blocks required. * * PARAMETERS: - * ip - pointer to in-core inode requiring allocation. - * blkno - starting block of the current allocation. - * nblocks - number of contiguous blocks within the current + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current * allocation. - * addnblocks - number of blocks to add to the allocation. - * results - on successful return, set to the starting block number + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number * of the existing allocation if the existing allocation * was extended in place or to a newly allocated contiguous * range if the existing allocation could not be extended * in place. 
* * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbReAlloc(struct inode *ip, @@ -997,7 +1004,7 @@ dbReAlloc(struct inode *ip, /* * NAME: dbExtend() * - * FUNCTION: attempt to extend a current allocation by a specified + * FUNCTION: attempt to extend a current allocation by a specified * number of blocks. * * this routine attempts to satisfy the allocation request @@ -1006,16 +1013,16 @@ dbReAlloc(struct inode *ip, * immediately following the current allocation. * * PARAMETERS: - * ip - pointer to in-core inode requiring allocation. - * blkno - starting block of the current allocation. - * nblocks - number of contiguous blocks within the current + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current * allocation. - * addnblocks - number of blocks to add to the allocation. + * addnblocks - number of blocks to add to the allocation. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) { @@ -1102,19 +1109,19 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) /* * NAME: dbAllocNext() * - * FUNCTION: attempt to allocate the blocks of the specified block + * FUNCTION: attempt to allocate the blocks of the specified block * range within a dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap. - * blkno - starting block number of the range. - * nblocks - number of contiguous free blocks of the range. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) held on entry/exit; */ @@ -1226,7 +1233,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbAllocNear() * - * FUNCTION: attempt to allocate a number of contiguous free blocks near + * FUNCTION: attempt to allocate a number of contiguous free blocks near * a specified block (hint) within a dmap. * * starting with the dmap leaf that covers the hint, we'll @@ -1235,18 +1242,18 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, * the desired free space. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap. - * blkno - block number to allocate near. - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. 
* * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) held on entry/exit; */ @@ -1309,7 +1316,7 @@ dbAllocNear(struct bmap * bmp, /* * NAME: dbAllocAG() * - * FUNCTION: attempt to allocate the specified number of contiguous + * FUNCTION: attempt to allocate the specified number of contiguous * free blocks within the specified allocation group. * * unless the allocation group size is equal to the number @@ -1346,17 +1353,17 @@ dbAllocNear(struct bmap * bmp, * the allocation group. * * PARAMETERS: - * bmp - pointer to bmap descriptor + * bmp - pointer to bmap descriptor * agno - allocation group number. - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * note: IWRITE_LOCK(ipmap) held on entry/exit; */ @@ -1539,7 +1546,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) /* * NAME: dbAllocAny() * - * FUNCTION: attempt to allocate the specified number of contiguous + * FUNCTION: attempt to allocate the specified number of contiguous * free blocks anywhere in the file system. * * dbAllocAny() attempts to find the sufficient free space by @@ -1549,16 +1556,16 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * desired free space is allocated. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1591,9 +1598,9 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) /* * NAME: dbFindCtl() * - * FUNCTION: starting at a specified dmap control page level and block + * FUNCTION: starting at a specified dmap control page level and block * number, search down the dmap control levels for a range of - * contiguous free blocks large enough to satisfy an allocation + * contiguous free blocks large enough to satisfy an allocation * request for the specified number of free blocks. * * if sufficient contiguous free blocks are found, this routine @@ -1602,17 +1609,17 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) * is sufficient in size. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * level - starting dmap control page level. - * l2nb - log2 number of contiguous free blocks desired. 
- * *blkno - on entry, starting block number for conducting the search. + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. * on successful return, the first block within a dmap page * that contains or starts a range of contiguous free blocks. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1692,7 +1699,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) /* * NAME: dbAllocCtl() * - * FUNCTION: attempt to allocate a specified number of contiguous + * FUNCTION: attempt to allocate a specified number of contiguous * blocks starting within a specific dmap. * * this routine is called by higher level routines that search @@ -1719,18 +1726,18 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) * first dmap (i.e. blkno). * * PARAMETERS: - * bmp - pointer to bmap descriptor - * nblocks - actual number of contiguous free blocks to allocate. - * l2nb - log2 number of contiguous free blocks to allocate. - * blkno - starting block number of the dmap to start the allocation + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation * from. - * results - on successful return, set to the starting block number + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1863,7 +1870,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) /* * NAME: dbAllocDmapLev() * - * FUNCTION: attempt to allocate a specified number of contiguous blocks + * FUNCTION: attempt to allocate a specified number of contiguous blocks * from a specified dmap. * * this routine checks if the contiguous blocks are available. @@ -1871,17 +1878,17 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * returned. * * PARAMETERS: - * mp - pointer to bmap descriptor - * dp - pointer to dmap to attempt to allocate blocks from. - * l2nb - log2 number of contiguous block desired. - * nblocks - actual number of contiguous block desired. - * results - on successful return, set to the starting block number + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number * of the newly allocated range. 
* * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; @@ -1926,7 +1933,7 @@ dbAllocDmapLev(struct bmap * bmp, /* * NAME: dbAllocDmap() * - * FUNCTION: adjust the disk allocation map to reflect the allocation + * FUNCTION: adjust the disk allocation map to reflect the allocation * of a specified block range within a dmap. * * this routine allocates the specified blocks from the dmap @@ -1939,14 +1946,14 @@ dbAllocDmapLev(struct bmap * bmp, * covers this dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to allocate the block range from. - * blkno - starting block number of the block to be allocated. - * nblocks - number of blocks to be allocated. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1982,7 +1989,7 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbFreeDmap() * - * FUNCTION: adjust the disk allocation map to reflect the allocation + * FUNCTION: adjust the disk allocation map to reflect the allocation * of a specified block range within a dmap. * * this routine frees the specified blocks from the dmap through @@ -1990,18 +1997,18 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * causes the maximum string of free blocks within the dmap to * change (i.e. the value of the root of the dmap's dmtree), this * routine will cause this change to be reflected up through the - * appropriate levels of the dmap control pages by a call to + * appropriate levels of the dmap control pages by a call to * dbAdjCtl() for the L0 dmap control page that covers this dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to free the block range from. - * blkno - starting block number of the block to be freed. - * nblocks - number of blocks to be freed. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -2048,7 +2055,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbAllocBits() * - * FUNCTION: allocate a specified block range from a dmap. + * FUNCTION: allocate a specified block range from a dmap. * * this routine updates the dmap to reflect the working * state allocation of the specified block range. it directly @@ -2058,10 +2065,10 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * dmap's dmtree, as a whole, to reflect the allocated range. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to allocate bits from. - * blkno - starting block number of the bits to be allocated. - * nblocks - number of bits to be allocated. 
+ * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. * * RETURN VALUES: none * @@ -2142,7 +2149,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the allocated words. */ for (; nwords > 0; nwords -= nw) { - if (leaf[word] < BUDMIN) { + if (leaf[word] < BUDMIN) { jfs_error(bmp->db_ipbmap->i_sb, "dbAllocBits: leaf page " "corrupt"); @@ -2195,7 +2202,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbFreeBits() * - * FUNCTION: free a specified block range from a dmap. + * FUNCTION: free a specified block range from a dmap. * * this routine updates the dmap to reflect the working * state allocation of the specified block range. it directly @@ -2205,10 +2212,10 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * dmtree, as a whole, to reflect the deallocated range. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to free bits from. - * blkno - starting block number of the bits to be freed. - * nblocks - number of bits to be freed. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. * * RETURN VALUES: 0 for success * @@ -2381,19 +2388,19 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the new root value and the next dmap control page level to * be adjusted. * PARAMETERS: - * bmp - pointer to bmap descriptor - * blkno - the first block of a block range within a dmap. it is + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is * the allocation or deallocation of this block range that * requires the dmap control page to be adjusted. - * newval - the new value of the lower level dmap or dmap control + * newval - the new value of the lower level dmap or dmap control * page root. - * alloc - 'true' if adjustment is due to an allocation. - * level - current level of dmap control page (i.e. L0, L1, L2) to + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to * be adjusted. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -2537,16 +2544,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) /* * NAME: dbSplit() * - * FUNCTION: update the leaf of a dmtree with a new value, splitting + * FUNCTION: update the leaf of a dmtree with a new value, splitting * the leaf from the binary buddy system of the dmtree's * leaves, as required. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. - * splitsz - the size the binary buddy system starting at the leaf + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf * must be split to, specified as the log2 number of blocks. - * newval - the new value for the leaf. + * newval - the new value for the leaf. 
* * RETURN VALUES: none * @@ -2593,7 +2600,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) /* * NAME: dbBackSplit() * - * FUNCTION: back split the binary buddy system of dmtree leaves + * FUNCTION: back split the binary buddy system of dmtree leaves * that hold a specified leaf until the specified leaf * starts its own binary buddy system. * @@ -2610,8 +2617,8 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) * in which a previous join operation must be backed out. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. * * RETURN VALUES: none * @@ -2685,14 +2692,14 @@ static int dbBackSplit(dmtree_t * tp, int leafno) /* * NAME: dbJoin() * - * FUNCTION: update the leaf of a dmtree with a new value, joining + * FUNCTION: update the leaf of a dmtree with a new value, joining * the leaf with other leaves of the dmtree into a multi-leaf * binary buddy system, as required. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. - * newval - the new value for the leaf. + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. * * RETURN VALUES: none */ @@ -2778,15 +2785,15 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) /* * NAME: dbAdjTree() * - * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * FUNCTION: update a leaf of a dmtree with a new value, adjusting * the dmtree, as required, to reflect the new leaf value. * the combination of any buddies must already be done before * this is called. * * PARAMETERS: - * tp - pointer to the tree to be adjusted. - * leafno - the number of the leaf to be updated. - * newval - the new value for the leaf. + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. * * RETURN VALUES: none */ @@ -2845,7 +2852,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) /* * NAME: dbFindLeaf() * - * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * FUNCTION: search a dmtree_t for sufficient free blocks, returning * the index of a leaf describing the free blocks if * sufficient free blocks are found. * @@ -2854,15 +2861,15 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) * free space. * * PARAMETERS: - * tp - pointer to the tree to be searched. - * l2nb - log2 number of free blocks to search for. + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. * leafidx - return pointer to be set to the index of the leaf * describing at least l2nb free blocks if sufficient * free blocks are found. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient free blocks. + * 0 - success + * -ENOSPC - insufficient free blocks. */ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) { @@ -2909,18 +2916,18 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) /* * NAME: dbFindBits() * - * FUNCTION: find a specified number of binary buddy free bits within a + * FUNCTION: find a specified number of binary buddy free bits within a * dmap bitmap word value. * * this routine searches the bitmap value for (1 << l2nb) free * bits at (1 << l2nb) alignments within the value. 
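To make the dbFindBits() contract concrete: for a request of (1 << l2nb) blocks the search only ever considers naturally aligned runs of that length inside one 32-bit map word, stepping from one aligned candidate position to the next rather than sliding bit by bit. The stand-alone sketch below is illustrative only and, unlike the on-disk dmap words, treats bit 0 as the least-significant bit and a set bit as a free block.

#include <stdint.h>

/*
 * Illustration of the aligned-run search: look for (1 << l2nb) free bits
 * starting at a (1 << l2nb)-aligned bit position within one 32-bit word.
 * Here a set bit means "free" and bit 0 is the least-significant bit;
 * the real dmap words use the opposite conventions, but the walk over
 * aligned candidate positions is the same idea.
 */
static int find_aligned_free_run(uint32_t free_bits, int l2nb)
{
	int nb = 1 << l2nb;				/* run length wanted, 1..32 */
	uint32_t run = (nb >= 32) ? ~0u : ((1u << nb) - 1u);
	int bitno;

	for (bitno = 0; bitno + nb <= 32; bitno += nb)
		if ((free_bits & (run << bitno)) == (run << bitno))
			return bitno;			/* whole aligned run is free */

	return -1;					/* no aligned run of that size here */
}

/* e.g. find_aligned_free_run(0x00ff0000, 3) returns 16: bits 16..23 are free. */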
* * PARAMETERS: - * word - dmap bitmap word value. - * l2nb - number of free bits specified as a log2 number. + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. * * RETURN VALUES: - * starting bit number of free bits. + * starting bit number of free bits. */ static int dbFindBits(u32 word, int l2nb) { @@ -2956,14 +2963,14 @@ static int dbFindBits(u32 word, int l2nb) /* * NAME: dbMaxBud(u8 *cp) * - * FUNCTION: determine the largest binary buddy string of free + * FUNCTION: determine the largest binary buddy string of free * bits within 32-bits of the map. * * PARAMETERS: - * cp - pointer to the 32-bit value. + * cp - pointer to the 32-bit value. * * RETURN VALUES: - * largest binary buddy of free bits within a dmap word. + * largest binary buddy of free bits within a dmap word. */ static int dbMaxBud(u8 * cp) { @@ -2993,14 +3000,14 @@ static int dbMaxBud(u8 * cp) /* * NAME: cnttz(uint word) * - * FUNCTION: determine the number of trailing zeros within a 32-bit + * FUNCTION: determine the number of trailing zeros within a 32-bit * value. * * PARAMETERS: - * value - 32-bit value to be examined. + * value - 32-bit value to be examined. * * RETURN VALUES: - * count of trailing zeros + * count of trailing zeros */ static int cnttz(u32 word) { @@ -3018,14 +3025,14 @@ static int cnttz(u32 word) /* * NAME: cntlz(u32 value) * - * FUNCTION: determine the number of leading zeros within a 32-bit + * FUNCTION: determine the number of leading zeros within a 32-bit * value. * * PARAMETERS: - * value - 32-bit value to be examined. + * value - 32-bit value to be examined. * * RETURN VALUES: - * count of leading zeros + * count of leading zeros */ static int cntlz(u32 value) { @@ -3043,14 +3050,14 @@ static int cntlz(u32 value) * NAME: blkstol2(s64 nb) * * FUNCTION: convert a block count to its log2 value. if the block - * count is not a l2 multiple, it is rounded up to the next + * count is not a l2 multiple, it is rounded up to the next * larger l2 multiple. * * PARAMETERS: - * nb - number of blocks + * nb - number of blocks * * RETURN VALUES: - * log2 number of blocks + * log2 number of blocks */ static int blkstol2(s64 nb) { @@ -3092,13 +3099,13 @@ static int blkstol2(s64 nb) * at a time. * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - starting block number to be freed. - * nblocks - number of blocks to be freed. + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) { @@ -3271,10 +3278,10 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, * L2 * | * L1---------------------------------L1 - * | | - * L0---------L0---------L0 L0---------L0---------L0 - * | | | | | | - * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm * * <---old---><----------------------------extend-----------------------> @@ -3300,7 +3307,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) (long long) blkno, (long long) nblocks, (long long) newsize); /* - * initialize bmap control page. + * initialize bmap control page. 
* * all the data in bmap control page should exclude * the mkfs hidden dmap page. @@ -3323,7 +3330,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; /* - * reconfigure db_agfree[] + * reconfigure db_agfree[] * from old AG configuration to new AG configuration; * * coalesce contiguous k (newAGSize/oldAGSize) AGs; @@ -3355,7 +3362,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_maxag = bmp->db_maxag / k; /* - * extend bmap + * extend bmap * * update bit maps and corresponding level control pages; * global control page db_nfree, db_agfree[agno], db_maxfreebud; @@ -3403,7 +3410,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) /* compute start L0 */ j = 0; l1leaf = l1dcp->stree + CTLLEAFIND; - p += nbperpage; /* 1st L0 of L1.k */ + p += nbperpage; /* 1st L0 of L1.k */ } /* @@ -3541,7 +3548,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) return -EIO; /* - * finalize bmap control page + * finalize bmap control page */ finalize: @@ -3560,7 +3567,7 @@ void dbFinalizeBmap(struct inode *ipbmap) int i, n; /* - * finalize bmap control page + * finalize bmap control page */ //finalize: /* @@ -3946,8 +3953,8 @@ static int dbGetL2AGSize(s64 nblocks) * convert number of map pages to the zero origin top dmapctl level */ #define BMAPPGTOLEV(npages) \ - (((npages) <= 3 + MAXL0PAGES) ? 0 : \ - ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) + (((npages) <= 3 + MAXL0PAGES) ? 0 \ + : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) s64 dbMapFileSizeToMapSize(struct inode * ipbmap) { @@ -3974,8 +3981,8 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap) factor = (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); complete = (u32) npages / factor; - ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : - ((i == 1) ? LPERCTL : 1)); + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL + : ((i == 1) ? LPERCTL : 1)); /* pages in last/incomplete child */ npages = (u32) npages % factor; diff --git a/trunk/fs/jfs/jfs_dmap.h b/trunk/fs/jfs/jfs_dmap.h index 11e6d471b364..45ea454c74bd 100644 --- a/trunk/fs/jfs/jfs_dmap.h +++ b/trunk/fs/jfs/jfs_dmap.h @@ -83,7 +83,7 @@ static __inline signed char TREEMAX(signed char *cp) * - 1 is added to account for the control page of the map. */ #define BLKTODMAP(b,s) \ - ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) /* * convert disk block number to the logical block number of the LEVEL 0 @@ -98,7 +98,7 @@ static __inline signed char TREEMAX(signed char *cp) * - 1 is added to account for the control page of the map. */ #define BLKTOL0(b,s) \ - (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) /* * convert disk block number to the logical block number of the LEVEL 1 @@ -120,7 +120,7 @@ static __inline signed char TREEMAX(signed char *cp) * at the specified level which describes the disk block. */ #define BLKTOCTL(b,s,l) \ - (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) /* * convert aggregate map size to the zero origin dmapctl level of the @@ -145,27 +145,27 @@ static __inline signed char TREEMAX(signed char *cp) * dmaptree must be consistent with dmapctl. 
*/ struct dmaptree { - __le32 nleafs; /* 4: number of tree leafs */ - __le32 l2nleafs; /* 4: l2 number of tree leafs */ - __le32 leafidx; /* 4: index of first tree leaf */ - __le32 height; /* 4: height of the tree */ + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ s8 budmin; /* 1: min l2 tree leaf value to combine */ - s8 stree[TREESIZE]; /* TREESIZE: tree */ - u8 pad[2]; /* 2: pad to word boundary */ -}; /* - 360 - */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ /* * dmap page per 8K blocks bitmap */ struct dmap { - __le32 nblocks; /* 4: num blks covered by this dmap */ - __le32 nfree; /* 4: num of free blks in this dmap */ - __le64 start; /* 8: starting blkno for this dmap */ - struct dmaptree tree; /* 360: dmap tree */ - u8 pad[1672]; /* 1672: pad to 2048 bytes */ - __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ - __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ -}; /* - 4096 - */ + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ /* * disk map control page per level. @@ -173,14 +173,14 @@ struct dmap { * dmapctl must be consistent with dmaptree. */ struct dmapctl { - __le32 nleafs; /* 4: number of tree leafs */ - __le32 l2nleafs; /* 4: l2 number of tree leafs */ - __le32 leafidx; /* 4: index of the first tree leaf */ - __le32 height; /* 4: height of tree */ - s8 budmin; /* 1: minimum l2 tree leaf value */ - s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ - u8 pad[2714]; /* 2714: pad to 4096 */ -}; /* - 4096 - */ + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ /* * common definition for dmaptree within dmap and dmapctl @@ -202,41 +202,41 @@ typedef union dmtree { * on-disk aggregate disk allocation map descriptor. 
*/ struct dbmap_disk { - __le64 dn_mapsize; /* 8: number of blocks in aggregate */ - __le64 dn_nfree; /* 8: num free blks in aggregate map */ - __le32 dn_l2nbperpage; /* 4: number of blks per page */ - __le32 dn_numag; /* 4: total number of ags */ - __le32 dn_maxlevel; /* 4: number of active ags */ - __le32 dn_maxag; /* 4: max active alloc group number */ - __le32 dn_agpref; /* 4: preferred alloc group (hint) */ - __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ - __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ - __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ - __le32 dn_agstart; /* 4: start tree index at AG height */ - __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ - __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ - __le64 dn_agsize; /* 8: num of blks per alloc group */ - s8 dn_maxfreebud; /* 1: max free buddy system */ - u8 pad[3007]; /* 3007: pad to 4096 */ -}; /* - 4096 - */ + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ struct dbmap { - s64 dn_mapsize; /* number of blocks in aggregate */ - s64 dn_nfree; /* num free blks in aggregate map */ - int dn_l2nbperpage; /* number of blks per page */ - int dn_numag; /* total number of ags */ - int dn_maxlevel; /* number of active ags */ - int dn_maxag; /* max active alloc group number */ - int dn_agpref; /* preferred alloc group (hint) */ - int dn_aglevel; /* dmapctl level holding the AG */ - int dn_agheigth; /* height in dmapctl of the AG */ - int dn_agwidth; /* width in dmapctl of the AG */ - int dn_agstart; /* start tree index at AG height */ - int dn_agl2size; /* l2 num of blks per alloc group */ - s64 dn_agfree[MAXAG]; /* per AG free count */ - s64 dn_agsize; /* num of blks per alloc group */ - signed char dn_maxfreebud; /* max free buddy system */ -}; /* - 4096 - */ + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheigth; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ /* * in-memory aggregate disk allocation map descriptor. 
*/ diff --git a/trunk/fs/jfs/jfs_dtree.c b/trunk/fs/jfs/jfs_dtree.c index c14ba3cfa818..6d62f3222892 100644 --- a/trunk/fs/jfs/jfs_dtree.c +++ b/trunk/fs/jfs/jfs_dtree.c @@ -315,8 +315,8 @@ static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, lv = &llck->lv[llck->index]; /* - * Linelock slot size is twice the size of directory table - * slot size. 512 entries per page. + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; @@ -615,7 +615,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, btstack->nsplit = 1; /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -659,7 +659,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, } if (cmp == 0) { /* - * search hit + * search hit */ /* search hit - leaf page: * return the entry found @@ -723,7 +723,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, } /* - * search miss + * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. @@ -834,7 +834,7 @@ int dtInsert(tid_t tid, struct inode *ip, struct lv *lv; /* - * retrieve search result + * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of @@ -843,7 +843,7 @@ int dtInsert(tid_t tid, struct inode *ip, DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* - * insert entry for new key + * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { @@ -860,9 +860,9 @@ int dtInsert(tid_t tid, struct inode *ip, data.leaf.ino = *fsn; /* - * leaf page does not have enough room for new entry: + * leaf page does not have enough room for new entry: * - * extend/split the leaf page; + * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ @@ -877,9 +877,9 @@ int dtInsert(tid_t tid, struct inode *ip, } /* - * leaf page does have enough room for new entry: + * leaf page does have enough room for new entry: * - * insert the new data entry into the leaf page; + * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* @@ -967,13 +967,13 @@ static int dtSplitUp(tid_t tid, } /* - * split leaf page + * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* - * split root leaf page: + * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* @@ -1012,7 +1012,7 @@ static int dtSplitUp(tid_t tid, } /* - * extend first leaf page + * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) @@ -1068,7 +1068,7 @@ static int dtSplitUp(tid_t tid, } /* - * split leaf page into and a new right page . + * split leaf page into and a new right page . 
* * return pinned and its extent descriptor */ @@ -1433,7 +1433,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, rp->header.freecnt = rp->header.maxslot - fsi; /* - * sequential append at tail: append without split + * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is @@ -1467,7 +1467,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, } /* - * non-sequential insert (at possibly middle page) + * non-sequential insert (at possibly middle page) */ /* @@ -1508,7 +1508,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, left = 0; /* - * compute fill factor for split pages + * compute fill factor for split pages * * traces the next entry to move to rp * traces the next entry to stay in sp @@ -1551,7 +1551,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, /* poins to the 1st entry to move */ /* - * move entries to right page + * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * @@ -1677,7 +1677,7 @@ static int dtExtendPage(tid_t tid, return (rc); /* - * extend the extent + * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; @@ -1722,7 +1722,7 @@ static int dtExtendPage(tid_t tid, } /* - * extend the page + * extend the page */ sp->header.self = *pxd; @@ -1739,6 +1739,9 @@ static int dtExtendPage(tid_t tid, /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; +#ifdef _STILL_TO_PORT + bmSetXD(smp, xaddr, xsize); +#endif /* _STILL_TO_PORT */ /* * copy old stbl to new stbl at start of extended area @@ -1833,7 +1836,7 @@ static int dtExtendPage(tid_t tid, } /* - * update parent entry on the parent/root page + * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page @@ -1901,7 +1904,7 @@ static int dtSplitRoot(tid_t tid, sp = &JFS_IP(ip)->i_dtroot; /* - * allocate/initialize a single (right) child page + * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; @@ -1940,7 +1943,7 @@ static int dtSplitRoot(tid_t tid, rp->header.prev = 0; /* - * move in-line root page into new right page extent + * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); @@ -2013,7 +2016,7 @@ static int dtSplitRoot(tid_t tid, dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* - * reset parent/root page + * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. @@ -2099,7 +2102,7 @@ int dtDelete(tid_t tid, dtpage_t *np; /* - * search for the entry to delete: + * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). 
*/ @@ -2250,7 +2253,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, int i; /* - * keep the root leaf page which has become empty + * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* @@ -2266,7 +2269,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, } /* - * free the non-root leaf page + * free the non-root leaf page */ /* * acquire a transaction lock on the page @@ -2296,7 +2299,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, discard_metapage(fmp); /* - * propagate page deletion up the directory tree + * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. @@ -2437,10 +2440,10 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, #ifdef _NOTYET /* - * NAME: dtRelocate() + * NAME: dtRelocate() * - * FUNCTION: relocate dtpage (internal or leaf) of directory; - * This function is mainly used by defragfs utility. + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. */ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, s64 nxaddr) @@ -2468,8 +2471,8 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, xlen); /* - * 1. get the internal parent dtpage covering - * router entry for the tartget page to be relocated; + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; */ rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); if (rc) @@ -2480,7 +2483,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, jfs_info("dtRelocate: parent router entry validated."); /* - * 2. relocate the target dtpage + * 2. relocate the target dtpage */ /* read in the target page from src extent */ DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); @@ -2578,7 +2581,9 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, /* update the buffer extent descriptor of the dtpage */ xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; - +#ifdef _STILL_TO_PORT + bmSetXD(mp, nxaddr, xsize); +#endif /* _STILL_TO_PORT */ /* unpin the relocated page */ DT_PUTPAGE(mp); jfs_info("dtRelocate: target dtpage relocated."); @@ -2589,7 +2594,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, */ /* - * 3. acquire maplock for the source extent to be freed; + * 3. acquire maplock for the source extent to be freed; */ /* for dtpage relocation, write a LOG_NOREDOPAGE record * for the source dtpage (logredo() will init NoRedoPage @@ -2604,7 +2609,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, pxdlock->index = 1; /* - * 4. update the parent router entry for relocation; + * 4. update the parent router entry for relocation; * * acquire tlck for the parent entry covering the target dtpage; * write LOG_REDOPAGE to apply after image only; @@ -2632,7 +2637,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, * NAME: dtSearchNode() * * FUNCTION: Search for an dtpage containing a specified address - * This function is mainly used by defragfs utility. + * This function is mainly used by defragfs utility. * * NOTE: Search result on stack, the found page is pinned at exit. * The result page must be an internal dtpage. 
@@ -2655,7 +2660,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, BT_CLR(btstack); /* reset stack */ /* - * descend tree to the level with specified leftmost page + * descend tree to the level with specified leftmost page * * by convention, root bn = 0. */ @@ -2694,7 +2699,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, } /* - * search each page at the current levevl + * search each page at the current levevl */ loop: stbl = DT_GETSTBL(p); @@ -3039,9 +3044,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. - * Special cases: 0 = . - * 1 = .. - * -1 = End of directory + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory */ do_index = 1; @@ -3123,10 +3128,10 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * - * pn = index = 0: First entry "." - * pn = 0; index = 1: Second entry ".." - * pn > 0: Real entries, pn=1 -> leftmost page - * pn = index = -1: No more entries + * pn = index = 0: First entry "." + * pn = 0; index = 1: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries */ dtpos = filp->f_pos; if (dtpos == 0) { @@ -3346,7 +3351,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) BT_CLR(btstack); /* reset stack */ /* - * descend leftmost path of the tree + * descend leftmost path of the tree * * by convention, root bn = 0. */ @@ -4526,7 +4531,7 @@ int dtModify(tid_t tid, struct inode *ip, struct ldtentry *entry; /* - * search for the entry to modify: + * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ diff --git a/trunk/fs/jfs/jfs_dtree.h b/trunk/fs/jfs/jfs_dtree.h index 8561c6ecece0..af8513f78648 100644 --- a/trunk/fs/jfs/jfs_dtree.h +++ b/trunk/fs/jfs/jfs_dtree.h @@ -35,7 +35,7 @@ typedef union { /* - * entry segment/slot + * entry segment/slot * * an entry consists of type dependent head/only segment/slot and * additional segments/slots linked vi next field; diff --git a/trunk/fs/jfs/jfs_extent.c b/trunk/fs/jfs/jfs_extent.c index 7ae1e3281de9..a35bdca6a805 100644 --- a/trunk/fs/jfs/jfs_extent.c +++ b/trunk/fs/jfs/jfs_extent.c @@ -34,8 +34,8 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); #endif static s64 extRoundDown(s64 nb); -#define DPD(a) (printk("(a): %d\n",(a))) -#define DPC(a) (printk("(a): %c\n",(a))) +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) #define DPL1(a) \ { \ if ((a) >> 32) \ @@ -51,19 +51,19 @@ static s64 extRoundDown(s64 nb); printk("(a): %x\n",(a) << 32); \ } -#define DPD1(a) (printk("(a): %d ",(a))) -#define DPX(a) (printk("(a): %08x\n",(a))) -#define DPX1(a) (printk("(a): %08x ",(a))) -#define DPS(a) (printk("%s\n",(a))) -#define DPE(a) (printk("\nENTERING: %s\n",(a))) -#define DPE1(a) (printk("\nENTERING: %s",(a))) -#define DPS1(a) (printk(" %s ",(a))) +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) /* * NAME: extAlloc() * - * FUNCTION: allocate an extent for a specified page range within a + * FUNCTION: allocate an extent for a specified page range within a * file. 
* * PARAMETERS: @@ -78,9 +78,9 @@ static s64 extRoundDown(s64 nb); * should be marked as allocated but not recorded. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) @@ -192,9 +192,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) #ifdef _NOTYET /* - * NAME: extRealloc() + * NAME: extRealloc() * - * FUNCTION: extend the allocation of a file extent containing a + * FUNCTION: extend the allocation of a file extent containing a * partial back last page. * * PARAMETERS: @@ -207,9 +207,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) * should be marked as allocated but not recorded. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) { @@ -345,9 +345,9 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* - * NAME: extHint() + * NAME: extHint() * - * FUNCTION: produce an extent allocation hint for a file offset. + * FUNCTION: produce an extent allocation hint for a file offset. * * PARAMETERS: * ip - the inode of the file. @@ -356,8 +356,8 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) * the hint. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int extHint(struct inode *ip, s64 offset, xad_t * xp) { @@ -387,7 +387,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) lxdl.nlxd = 1; lxdl.lxd = &lxd; LXDoffset(&lxd, prev) - LXDlength(&lxd, nbperpage); + LXDlength(&lxd, nbperpage); xadl.maxnxad = 1; xadl.nxad = 0; @@ -397,11 +397,11 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) return (rc); - /* check if no extent exists for the previous page. + /* check if not extent exists for the previous page. * this is possible for sparse files. */ if (xadl.nxad == 0) { -// assert(ISSPARSE(ip)); +// assert(ISSPARSE(ip)); return (0); } @@ -410,28 +410,28 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) */ xp->flag &= XAD_NOTRECORDED; - if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { + if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { jfs_error(ip->i_sb, "extHint: corrupt xtree"); return -EIO; - } + } return (0); } /* - * NAME: extRecord() + * NAME: extRecord() * - * FUNCTION: change a page with a file from not recorded to recorded. + * FUNCTION: change a page with a file from not recorded to recorded. * * PARAMETERS: * ip - inode of the file. * cp - cbuf of the file page. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extRecord(struct inode *ip, xad_t * xp) { @@ -451,9 +451,9 @@ int extRecord(struct inode *ip, xad_t * xp) #ifdef _NOTYET /* - * NAME: extFill() + * NAME: extFill() * - * FUNCTION: allocate disk space for a file page that represents + * FUNCTION: allocate disk space for a file page that represents * a file hole. * * PARAMETERS: @@ -461,16 +461,16 @@ int extRecord(struct inode *ip, xad_t * xp) * cp - cbuf of the file page represent the hole. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. 
+ * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extFill(struct inode *ip, xad_t * xp) { int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; s64 blkno = offsetXAD(xp) >> ip->i_blkbits; -// assert(ISSPARSE(ip)); +// assert(ISSPARSE(ip)); /* initialize the extent allocation hint */ XADaddress(xp, 0); @@ -489,7 +489,7 @@ int extFill(struct inode *ip, xad_t * xp) /* * NAME: extBalloc() * - * FUNCTION: allocate disk blocks to form an extent. + * FUNCTION: allocate disk blocks to form an extent. * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks @@ -513,9 +513,9 @@ int extFill(struct inode *ip, xad_t * xp) * allocated block range. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ static int extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) @@ -580,7 +580,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* * NAME: extBrealloc() * - * FUNCTION: attempt to extend an extent's allocation. + * FUNCTION: attempt to extend an extent's allocation. * * Initially, we will try to extend the extent's allocation * in place. If this fails, we'll try to move the extent @@ -597,8 +597,8 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * * PARAMETERS: * ip - the inode of the file. - * blkno - starting block number of the extents current allocation. - * nblks - number of blocks within the extents current allocation. + * blkno - starting block number of the extents current allocation. + * nblks - number of blocks within the extents current allocation. * newnblks - pointer to a s64 value. on entry, this value is the * the new desired extent size (number of blocks). on * successful exit, this value is set to the extent's actual @@ -606,9 +606,9 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * newblkno - the starting block number of the extents new allocation. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ static int extBrealloc(struct inode *ip, @@ -634,16 +634,16 @@ extBrealloc(struct inode *ip, /* - * NAME: extRoundDown() + * NAME: extRoundDown() * - * FUNCTION: round down a specified number of blocks to the next + * FUNCTION: round down a specified number of blocks to the next * smallest power of 2 number. * * PARAMETERS: * nb - the inode of the file. * * RETURN VALUES: - * next smallest power of 2 number. + * next smallest power of 2 number. 
*/ static s64 extRoundDown(s64 nb) { diff --git a/trunk/fs/jfs/jfs_filsys.h b/trunk/fs/jfs/jfs_filsys.h index b3f5463fbe52..38f70ac03bec 100644 --- a/trunk/fs/jfs/jfs_filsys.h +++ b/trunk/fs/jfs/jfs_filsys.h @@ -34,9 +34,9 @@ #define JFS_UNICODE 0x00000001 /* unicode name */ /* mount time flags for error handling */ -#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ -#define JFS_ERR_CONTINUE 0x00000004 /* continue */ -#define JFS_ERR_PANIC 0x00000008 /* panic */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ /* Quota support */ #define JFS_USRQUOTA 0x00000010 @@ -83,6 +83,7 @@ /* case-insensitive name/directory support */ #define JFS_AIX 0x80000000 /* AIX support */ +/* POSIX name/directory support - Never implemented*/ /* * buffer cache configuration @@ -112,10 +113,10 @@ #define IDATASIZE 256 /* inode inline data size */ #define IXATTRSIZE 128 /* inode inline extended attribute size */ -#define XTPAGE_SIZE 4096 -#define log2_PAGESIZE 12 +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 -#define IAG_SIZE 4096 +#define IAG_SIZE 4096 #define IAG_EXTENT_SIZE 4096 #define INOSPERIAG 4096 /* number of disk inodes per iag */ #define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ diff --git a/trunk/fs/jfs/jfs_imap.c b/trunk/fs/jfs/jfs_imap.c index 3870ba8b9086..c6530227cda6 100644 --- a/trunk/fs/jfs/jfs_imap.c +++ b/trunk/fs/jfs/jfs_imap.c @@ -93,21 +93,21 @@ static int copy_from_dinode(struct dinode *, struct inode *); static void copy_to_dinode(struct dinode *, struct inode *); /* - * NAME: diMount() + * NAME: diMount() * - * FUNCTION: initialize the incore inode map control structures for + * FUNCTION: initialize the incore inode map control structures for * a fileset or aggregate init time. * - * the inode map's control structure (dinomap) is - * brought in from disk and placed in virtual memory. + * the inode map's control structure (dinomap) is + * brought in from disk and placed in virtual memory. * * PARAMETERS: - * ipimap - pointer to inode map inode for the aggregate or fileset. + * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. */ int diMount(struct inode *ipimap) { @@ -180,18 +180,18 @@ int diMount(struct inode *ipimap) /* - * NAME: diUnmount() + * NAME: diUnmount() * - * FUNCTION: write to disk the incore inode map control structures for + * FUNCTION: write to disk the incore inode map control structures for * a fileset or aggregate at unmount time. * * PARAMETERS: - * ipimap - pointer to inode map inode for the aggregate or fileset. + * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. */ int diUnmount(struct inode *ipimap, int mounterror) { @@ -274,9 +274,9 @@ int diSync(struct inode *ipimap) /* - * NAME: diRead() + * NAME: diRead() * - * FUNCTION: initialize an incore inode from disk. + * FUNCTION: initialize an incore inode from disk. 
* * on entry, the specifed incore inode should itself * specify the disk inode number corresponding to the @@ -285,7 +285,7 @@ int diSync(struct inode *ipimap) * this routine handles incore inode initialization for * both "special" and "regular" inodes. special inodes * are those required early in the mount process and - * require special handling since much of the file system + * require special handling since much of the file system * is not yet initialized. these "special" inodes are * identified by a NULL inode map inode pointer and are * actually initialized by a call to diReadSpecial(). @@ -298,12 +298,12 @@ int diSync(struct inode *ipimap) * incore inode. * * PARAMETERS: - * ip - pointer to incore inode to be initialized from disk. + * ip - pointer to incore inode to be initialized from disk. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOMEM - insufficient memory + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory * */ int diRead(struct inode *ip) @@ -410,26 +410,26 @@ int diRead(struct inode *ip) /* - * NAME: diReadSpecial() + * NAME: diReadSpecial() * - * FUNCTION: initialize a 'special' inode from disk. + * FUNCTION: initialize a 'special' inode from disk. * * this routines handles aggregate level inodes. The * inode cache cannot differentiate between the * aggregate inodes and the filesystem inodes, so we * handle these here. We don't actually use the aggregate - * inode map, since these inodes are at a fixed location + * inode map, since these inodes are at a fixed location * and in some cases the aggregate inode map isn't initialized * yet. * * PARAMETERS: - * sb - filesystem superblock + * sb - filesystem superblock * inum - aggregate inode number * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: - * new inode - success - * NULL - i/o error. + * new inode - success + * NULL - i/o error. */ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) { @@ -502,12 +502,12 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) } /* - * NAME: diWriteSpecial() + * NAME: diWriteSpecial() * - * FUNCTION: Write the special inode to disk + * FUNCTION: Write the special inode to disk * * PARAMETERS: - * ip - special inode + * ip - special inode * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: none @@ -554,9 +554,9 @@ void diWriteSpecial(struct inode *ip, int secondary) } /* - * NAME: diFreeSpecial() + * NAME: diFreeSpecial() * - * FUNCTION: Free allocated space for special inode + * FUNCTION: Free allocated space for special inode */ void diFreeSpecial(struct inode *ip) { @@ -572,9 +572,9 @@ void diFreeSpecial(struct inode *ip) /* - * NAME: diWrite() + * NAME: diWrite() * - * FUNCTION: write the on-disk inode portion of the in-memory inode + * FUNCTION: write the on-disk inode portion of the in-memory inode * to its corresponding on-disk inode. * * on entry, the specifed incore inode should itself @@ -589,11 +589,11 @@ void diFreeSpecial(struct inode *ip) * * PARAMETERS: * tid - transacation id - * ip - pointer to incore inode to be written to the inode extent. + * ip - pointer to incore inode to be written to the inode extent. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. 
*/ int diWrite(tid_t tid, struct inode *ip) { @@ -730,7 +730,7 @@ int diWrite(tid_t tid, struct inode *ip) ilinelock = (struct linelock *) & tlck->lock; /* - * regular file: 16 byte (XAD slot) granularity + * regular file: 16 byte (XAD slot) granularity */ if (type & tlckXTREE) { xtpage_t *p, *xp; @@ -755,7 +755,7 @@ int diWrite(tid_t tid, struct inode *ip) xad->flag &= ~(XAD_NEW | XAD_EXTENDED); } /* - * directory: 32 byte (directory entry slot) granularity + * directory: 32 byte (directory entry slot) granularity */ else if (type & tlckDTREE) { dtpage_t *p, *xp; @@ -800,8 +800,9 @@ int diWrite(tid_t tid, struct inode *ip) } /* - * lock/copy inode base: 128 byte slot granularity + * lock/copy inode base: 128 byte slot granularity */ +// baseDinode: lv = & dilinelock->lv[dilinelock->index]; lv->offset = dioffset >> L2INODESLOTSIZE; copy_to_dinode(dp, ip); @@ -812,6 +813,17 @@ int diWrite(tid_t tid, struct inode *ip) lv->length = 1; dilinelock->index++; +#ifdef _JFS_FASTDASD + /* + * We aren't logging changes to the DASD used in directory inodes, + * but we need to write them to disk. If we don't unmount cleanly, + * mount will recalculate the DASD used. + */ + if (S_ISDIR(ip->i_mode) + && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED)) + memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd)); +#endif /* _JFS_FASTDASD */ + /* release the buffer holding the updated on-disk inode. * the buffer will be later written by commit processing. */ @@ -822,9 +834,9 @@ int diWrite(tid_t tid, struct inode *ip) /* - * NAME: diFree(ip) + * NAME: diFree(ip) * - * FUNCTION: free a specified inode from the inode working map + * FUNCTION: free a specified inode from the inode working map * for a fileset or aggregate. * * if the inode to be freed represents the first (only) @@ -853,11 +865,11 @@ int diWrite(tid_t tid, struct inode *ip) * any updates and are held until all updates are complete. * * PARAMETERS: - * ip - inode to be freed. + * ip - inode to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int diFree(struct inode *ip) { @@ -890,8 +902,7 @@ int diFree(struct inode *ip) * the map. */ if (iagno >= imap->im_nextiag) { - print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, - imap, 32, 0); + dump_mem("imap", imap, 32); jfs_error(ip->i_sb, "diFree: inum = %d, iagno = %d, nextiag = %d", (uint) inum, iagno, imap->im_nextiag); @@ -953,8 +964,8 @@ int diFree(struct inode *ip) return -EIO; } /* - * inode extent still has some inodes or below low water mark: - * keep the inode extent; + * inode extent still has some inodes or below low water mark: + * keep the inode extent; */ if (bitmap || imap->im_agctl[agno].numfree < 96 || @@ -1036,12 +1047,12 @@ int diFree(struct inode *ip) /* - * inode extent has become free and above low water mark: - * free the inode extent; + * inode extent has become free and above low water mark: + * free the inode extent; */ /* - * prepare to update iag list(s) (careful update step 1) + * prepare to update iag list(s) (careful update step 1) */ amp = bmp = cmp = dmp = NULL; fwd = back = -1; @@ -1141,7 +1152,7 @@ int diFree(struct inode *ip) invalidate_pxd_metapages(ip, freepxd); /* - * update iag list(s) (careful update step 2) + * update iag list(s) (careful update step 2) */ /* add the iag to the ag extent free list if this is the * first free extent for the iag. 
@@ -1327,20 +1338,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) /* - * NAME: diAlloc(pip,dir,ip) + * NAME: diAlloc(pip,dir,ip) * - * FUNCTION: allocate a disk inode from the inode working map + * FUNCTION: allocate a disk inode from the inode working map * for a fileset or aggregate. * * PARAMETERS: - * pip - pointer to incore inode for the parent inode. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ int diAlloc(struct inode *pip, bool dir, struct inode *ip) { @@ -1422,7 +1433,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); /* - * try to allocate from the IAG + * try to allocate from the IAG */ /* check if the inode may be allocated from the iag * (i.e. the inode has free inodes or new extent can be added). @@ -1622,9 +1633,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* - * NAME: diAllocAG(imap,agno,dir,ip) + * NAME: diAllocAG(imap,agno,dir,ip) * - * FUNCTION: allocate a disk inode from the allocation group. + * FUNCTION: allocate a disk inode from the allocation group. * * this routine first determines if a new extent of free * inodes should be added for the allocation group, with @@ -1638,17 +1649,17 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) * PRE CONDITION: Already have the AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group to allocate from. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to the new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1698,9 +1709,9 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) /* - * NAME: diAllocAny(imap,agno,dir,iap) + * NAME: diAllocAny(imap,agno,dir,iap) * - * FUNCTION: allocate a disk inode from any other allocation group. + * FUNCTION: allocate a disk inode from any other allocation group. * * this routine is called when an allocation attempt within * the primary allocation group has failed. if attempts to @@ -1708,17 +1719,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) * specified primary group. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - primary allocation group (to avoid). - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. 
+ * ip - pointer to a new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1761,9 +1772,9 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) /* - * NAME: diAllocIno(imap,agno,ip) + * NAME: diAllocIno(imap,agno,ip) * - * FUNCTION: allocate a disk inode from the allocation group's free + * FUNCTION: allocate a disk inode from the allocation group's free * inode list, returning an error if this free list is * empty (i.e. no iags on the list). * @@ -1774,16 +1785,16 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) * PRE CONDITION: Already have AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) { @@ -1879,7 +1890,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) /* - * NAME: diAllocExt(imap,agno,ip) + * NAME: diAllocExt(imap,agno,ip) * * FUNCTION: add a new extent of free inodes to an iag, allocating * an inode from this extent to satisfy the current allocation @@ -1899,16 +1910,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) * for the purpose of satisfying this request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group number. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) { @@ -1999,7 +2010,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) /* - * NAME: diAllocBit(imap,iagp,ino) + * NAME: diAllocBit(imap,iagp,ino) * * FUNCTION: allocate a backed inode from an iag. * @@ -2019,14 +2030,14 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * ino - inode number to be allocated within the iag. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. 
+ * -EIO - i/o error. */ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) { @@ -2133,11 +2144,11 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) /* - * NAME: diNewExt(imap,iagp,extno) + * NAME: diNewExt(imap,iagp,extno) * - * FUNCTION: initialize a new extent of inodes for an iag, allocating - * the first inode of the extent for use for the current - * allocation request. + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. * * disk resources are allocated for the new extent of inodes * and the inodes themselves are initialized to reflect their @@ -2166,14 +2177,14 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * extno - extent number. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) { @@ -2419,7 +2430,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* - * NAME: diNewIAG(imap,iagnop,agno) + * NAME: diNewIAG(imap,iagnop,agno) * * FUNCTION: allocate a new iag for an allocation group. * @@ -2432,16 +2443,16 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) * and returned to satisfy the request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagnop - pointer to an iag number set with the number of the + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the * newly allocated iag upon successful return. - * agno - allocation group number. + * agno - allocation group number. * bpp - Buffer pointer to be filled in with new IAG's buffer * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. * * serialization: * AG lock held on entry/exit; @@ -2450,7 +2461,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) * * note: new iag transaction: * . synchronously write iag; - * . write log of xtree and inode of imap; + * . write log of xtree and inode of imap; * . commit; * . synchronous write of xtree (right to left, bottom to top); * . at start of logredo(): init in-memory imap with one additional iag page; @@ -2470,6 +2481,9 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) s64 xaddr = 0; s64 blkno; tid_t tid; +#ifdef _STILL_TO_PORT + xad_t xad; +#endif /* _STILL_TO_PORT */ struct inode *iplist[1]; /* pick up pointers to the inode map and mount inodes */ @@ -2660,15 +2674,15 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) } /* - * NAME: diIAGRead() + * NAME: diIAGRead() * - * FUNCTION: get the buffer for the specified iag within a fileset + * FUNCTION: get the buffer for the specified iag within a fileset * or aggregate inode map. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagno - iag number. - * bpp - point to buffer pointer to be filled in on successful + * imap - pointer to inode map control structure. 
+ * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful * exit. * * SERIALIZATION: @@ -2677,8 +2691,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) * the read lock is unnecessary.) * * RETURN VALUES: - * 0 - success. - * -EIO - i/o error. + * 0 - success. + * -EIO - i/o error. */ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) { @@ -2698,17 +2712,17 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) } /* - * NAME: diFindFree() + * NAME: diFindFree() * - * FUNCTION: find the first free bit in a word starting at + * FUNCTION: find the first free bit in a word starting at * the specified bit position. * * PARAMETERS: - * word - word to be examined. - * start - starting bit position. + * word - word to be examined. + * start - starting bit position. * * RETURN VALUES: - * bit position of first free bit in the word or 32 if + * bit position of first free bit in the word or 32 if * no free bits were found. */ static int diFindFree(u32 word, int start) @@ -2883,7 +2897,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) atomic_read(&imap->im_numfree)); /* - * reconstruct imap + * reconstruct imap * * coalesce contiguous k (newAGSize/oldAGSize) AGs; * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; @@ -2899,7 +2913,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) } /* - * process each iag page of the map. + * process each iag page of the map. * * rebuild AG Free Inode List, AG Free Inode Extent List; */ @@ -2918,7 +2932,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) /* leave free iag in the free iag list */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { - release_metapage(bp); + release_metapage(bp); continue; } @@ -3049,13 +3063,13 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno, } /* - * NAME: copy_from_dinode() + * NAME: copy_from_dinode() * - * FUNCTION: Copies inode info from disk inode to in-memory inode + * FUNCTION: Copies inode info from disk inode to in-memory inode * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient memory + * 0 - success + * -ENOMEM - insufficient memory */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { @@ -3137,9 +3151,9 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } /* - * NAME: copy_to_dinode() + * NAME: copy_to_dinode() * - * FUNCTION: Copies inode info from in-memory inode to disk inode + * FUNCTION: Copies inode info from in-memory inode to disk inode */ static void copy_to_dinode(struct dinode * dip, struct inode *ip) { diff --git a/trunk/fs/jfs/jfs_imap.h b/trunk/fs/jfs/jfs_imap.h index 610a0e9d8941..4f9c346ed498 100644 --- a/trunk/fs/jfs/jfs_imap.h +++ b/trunk/fs/jfs/jfs_imap.h @@ -24,17 +24,17 @@ * jfs_imap.h: disk inode manager */ -#define EXTSPERIAG 128 /* number of disk inode extent per iag */ -#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ -#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ #define EXTSPERSUM 32 /* number of extents per summary map entry */ #define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ #define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ -#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ -#define MAXAG 128 /* maximum number of allocation groups 
*/ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ -#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ -#define SMAPSIZE 16 /* bytes in the IAG summary maps */ +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ /* convert inode number to iag number */ #define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) @@ -60,31 +60,31 @@ * inode allocation group page (per 4096 inodes of an AG) */ struct iag { - __le64 agstart; /* 8: starting block of ag */ - __le32 iagnum; /* 4: inode allocation group number */ - __le32 inofreefwd; /* 4: ag inode free list forward */ - __le32 inofreeback; /* 4: ag inode free list back */ - __le32 extfreefwd; /* 4: ag inode extent free list forward */ - __le32 extfreeback; /* 4: ag inode extent free list back */ - __le32 iagfree; /* 4: iag free list */ + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ /* summary map: 1 bit per inode extent */ __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; - * note: this indicates free and backed - * inodes, if the extent is not backed the - * value will be 1. if the extent is - * backed but all inodes are being used the - * value will be 1. if the extent is - * backed but at least one of the inodes is - * free the value will be 0. + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. 
*/ __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ - __le32 nfreeinos; /* 4: number of free inodes */ - __le32 nfreeexts; /* 4: number of free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ /* (72) */ u8 pad[1976]; /* 1976: pad to 2048 bytes */ /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ - __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ }; /* (4096) */ @@ -93,44 +93,44 @@ struct iag { * per AG control information (in inode map control page) */ struct iagctl_disk { - __le32 inofree; /* 4: free inode list anchor */ - __le32 extfree; /* 4: free extent list anchor */ - __le32 numinos; /* 4: number of backed inodes */ - __le32 numfree; /* 4: number of free inodes */ + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ }; /* (16) */ struct iagctl { - int inofree; /* free inode list anchor */ - int extfree; /* free extent list anchor */ - int numinos; /* number of backed inodes */ - int numfree; /* number of free inodes */ + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ }; /* * per fileset/aggregate inode map control page */ struct dinomap_disk { - __le32 in_freeiag; /* 4: free iag list anchor */ - __le32 in_nextiag; /* 4: next free iag number */ - __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ __le32 in_numfree; /* 4: num of free backed inodes */ __le32 in_nbperiext; /* 4: num of blocks per inode extent */ - __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ - __le32 in_diskblock; /* 4: for standalone test driver */ - __le32 in_maxag; /* 4: for standalone test driver */ - u8 pad[2016]; /* 2016: pad to 2048 */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ }; /* (4096) */ struct dinomap { - int in_freeiag; /* free iag list anchor */ - int in_nextiag; /* next free iag number */ - int in_numinos; /* num of backed inodes */ - int in_numfree; /* num of free backed inodes */ + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ int in_nbperiext; /* num of blocks per inode extent */ - int in_l2nbperiext; /* l2 of in_nbperiext */ - int in_diskblock; /* for standalone test driver */ - int in_maxag; /* for standalone test driver */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ struct iagctl in_agctl[MAXAG]; /* AG control information */ }; @@ -139,9 +139,9 @@ struct dinomap { */ struct inomap { struct dinomap im_imap; /* 4096: inode allocation control */ - struct inode *im_ipimap; /* 4: ptr to inode for imap */ - struct 
mutex im_freelock; /* 4: iag free list lock */ - struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ u32 *im_DBGdimap; atomic_t im_numinos; /* num of backed inodes */ atomic_t im_numfree; /* num of free backed inodes */ diff --git a/trunk/fs/jfs/jfs_incore.h b/trunk/fs/jfs/jfs_incore.h index cb8f30985ad1..8f453eff3c83 100644 --- a/trunk/fs/jfs/jfs_incore.h +++ b/trunk/fs/jfs/jfs_incore.h @@ -40,7 +40,7 @@ struct jfs_inode_info { uint mode2; /* jfs-specific mode */ uint saved_uid; /* saved for uid mount option */ uint saved_gid; /* saved for gid mount option */ - pxd_t ixpxd; /* inode extent descriptor */ + pxd_t ixpxd; /* inode extent descriptor */ dxd_t acl; /* dxd describing acl */ dxd_t ea; /* dxd describing ea */ time_t otime; /* time created */ @@ -190,7 +190,7 @@ struct jfs_sb_info { uint gengen; /* inode generation generator*/ uint inostamp; /* shows inode belongs to fileset*/ - /* Formerly in ipbmap */ + /* Formerly in ipbmap */ struct bmap *bmap; /* incore bmap descriptor */ struct nls_table *nls_tab; /* current codepage */ struct inode *direct_inode; /* metadata inode */ diff --git a/trunk/fs/jfs/jfs_logmgr.c b/trunk/fs/jfs/jfs_logmgr.c index de3e4a506dbc..44a2f33cb98d 100644 --- a/trunk/fs/jfs/jfs_logmgr.c +++ b/trunk/fs/jfs/jfs_logmgr.c @@ -244,7 +244,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, goto writeRecord; /* - * initialize/update page/transaction recovery lsn + * initialize/update page/transaction recovery lsn */ lsn = log->lsn; @@ -263,7 +263,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * initialize/update lsn of tblock of the page + * initialize/update lsn of tblock of the page * * transaction inherits oldest lsn of pages associated * with allocation/deallocation of resources (their @@ -307,7 +307,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, LOGSYNC_UNLOCK(log, flags); /* - * write the log record + * write the log record */ writeRecord: lsn = lmWriteRecord(log, tblk, lrd, tlck); @@ -372,7 +372,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, goto moveLrd; /* - * move log record data + * move log record data */ /* retrieve source meta-data page to log */ if (tlck->flag & tlckPAGELOCK) { @@ -465,7 +465,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * move log record descriptor + * move log record descriptor */ moveLrd: lrd->length = cpu_to_le16(len); @@ -574,7 +574,7 @@ static int lmNextPage(struct jfs_log * log) LOGGC_LOCK(log); /* - * write or queue the full page at the tail of write queue + * write or queue the full page at the tail of write queue */ /* get the tail tblk on commit queue */ if (list_empty(&log->cqueue)) @@ -625,7 +625,7 @@ static int lmNextPage(struct jfs_log * log) LOGGC_UNLOCK(log); /* - * allocate/initialize next page + * allocate/initialize next page */ /* if log wraps, the first data page of log is 2 * (0 never used, 1 is superblock). @@ -953,7 +953,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) } /* - * forward syncpt + * forward syncpt */ /* if last sync is same as last syncpt, * invoke sync point forward processing to update sync. 
@@ -989,7 +989,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) lsn = log->lsn; /* - * setup next syncpt trigger (SWAG) + * setup next syncpt trigger (SWAG) */ logsize = log->logsize; @@ -1000,11 +1000,11 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) if (more < 2 * LOGPSIZE) { jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); /* - * log wrapping + * log wrapping * * option 1 - panic ? No.! * option 2 - shutdown file systems - * associated with log ? + * associated with log ? * option 3 - extend log ? */ /* @@ -1062,7 +1062,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) /* * NAME: lmLogOpen() * - * FUNCTION: open the log on first open; + * FUNCTION: open the log on first open; * insert filesystem in the active list of the log. * * PARAMETER: ipmnt - file system mount inode @@ -1113,7 +1113,7 @@ int lmLogOpen(struct super_block *sb) init_waitqueue_head(&log->syncwait); /* - * external log as separate logical volume + * external log as separate logical volume * * file systems to log may have n-to-1 relationship; */ @@ -1155,7 +1155,7 @@ int lmLogOpen(struct super_block *sb) return 0; /* - * unwind on error + * unwind on error */ shutdown: /* unwind lbmLogInit() */ list_del(&log->journal_list); @@ -1427,7 +1427,7 @@ int lmLogInit(struct jfs_log * log) return 0; /* - * unwind on error + * unwind on error */ errout30: /* release log page */ log->wqueue = NULL; @@ -1480,7 +1480,7 @@ int lmLogClose(struct super_block *sb) if (test_bit(log_INLINELOG, &log->flag)) { /* - * in-line log in host file system + * in-line log in host file system */ rc = lmLogShutdown(log); kfree(log); @@ -1504,7 +1504,7 @@ int lmLogClose(struct super_block *sb) goto out; /* - * external log as separate logical volume + * external log as separate logical volume */ list_del(&log->journal_list); bdev = log->bdev; @@ -1622,26 +1622,20 @@ void jfs_flush_journal(struct jfs_log *log, int wait) if (!list_empty(&log->synclist)) { struct logsyncblk *lp; - printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); list_for_each_entry(lp, &log->synclist, synclist) { if (lp->xflag & COMMIT_PAGE) { struct metapage *mp = (struct metapage *)lp; - print_hex_dump(KERN_ERR, "metapage: ", - DUMP_PREFIX_ADDRESS, 16, 4, - mp, sizeof(struct metapage), 0); - print_hex_dump(KERN_ERR, "page: ", - DUMP_PREFIX_ADDRESS, 16, - sizeof(long), mp->page, - sizeof(struct page), 0); - } else - print_hex_dump(KERN_ERR, "tblock:", - DUMP_PREFIX_ADDRESS, 16, 4, - lp, sizeof(struct tblock), 0); + dump_mem("orphan metapage", lp, + sizeof(struct metapage)); + dump_mem("page", mp->page, sizeof(struct page)); + } + else + dump_mem("orphan tblock", lp, + sizeof(struct tblock)); } } -#else - WARN_ON(!list_empty(&log->synclist)); #endif + //assert(list_empty(&log->synclist)); clear_bit(log_FLUSH, &log->flag); } @@ -1729,7 +1723,7 @@ int lmLogShutdown(struct jfs_log * log) * * PARAMETE: log - pointer to logs inode. * fsdev - kdev_t of filesystem. - * serial - pointer to returned log serial number + * serial - pointer to returned log serial number * activate - insert/remove device from active list. * * RETURN: 0 - success @@ -1969,7 +1963,7 @@ static void lbmfree(struct lbuf * bp) * FUNCTION: add a log buffer to the log redrive list * * PARAMETER: - * bp - log buffer + * bp - log buffer * * NOTES: * Takes log_redrive_lock. 
@@ -2060,7 +2054,7 @@ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, bp->l_flag = flag; /* - * insert bp at tail of write queue associated with log + * insert bp at tail of write queue associated with log * * (request is either for bp already/currently at head of queue * or new bp to be inserted at tail) @@ -2123,7 +2117,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); /* - * initiate pageout of the page + * initiate pageout of the page */ lbmStartIO(bp); } @@ -2134,7 +2128,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) * * FUNCTION: Interface to DD strategy routine * - * RETURN: none + * RETURN: none * * serialization: LCACHE_LOCK() is NOT held during log i/o; */ @@ -2228,7 +2222,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) bio_put(bio); /* - * pagein completion + * pagein completion */ if (bp->l_flag & lbmREAD) { bp->l_flag &= ~lbmREAD; @@ -2242,7 +2236,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * pageout completion + * pageout completion * * the bp at the head of write queue has completed pageout. * @@ -2308,7 +2302,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * synchronous pageout: + * synchronous pageout: * * buffer has not necessarily been removed from write queue * (e.g., synchronous write of partial-page with COMMIT): @@ -2322,7 +2316,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * Group Commit pageout: + * Group Commit pageout: */ else if (bp->l_flag & lbmGC) { LCACHE_UNLOCK(flags); @@ -2330,7 +2324,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * asynchronous pageout: + * asynchronous pageout: * * buffer must have been removed from write queue: * insert buffer at head of freelist where it can be recycled @@ -2381,7 +2375,7 @@ int jfsIOWait(void *arg) * FUNCTION: format file system log * * PARAMETERS: - * log - volume log + * log - volume log * logAddress - start address of log space in FS block * logSize - length of log space in FS block; * @@ -2413,16 +2407,16 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) npages = logSize >> sbi->l2nbperpage; /* - * log space: + * log space: * * page 0 - reserved; * page 1 - log superblock; * page 2 - log data page: A SYNC log record is written - * into this page at logform time; + * into this page at logform time; * pages 3-N - log data page: set to empty log data pages; */ /* - * init log superblock: log page 1 + * init log superblock: log page 1 */ logsuper = (struct logsuper *) bp->l_ldata; @@ -2442,7 +2436,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) goto exit; /* - * init pages 2 to npages-1 as log data pages: + * init pages 2 to npages-1 as log data pages: * * log page sequence number (lpsn) initialization: * @@ -2485,7 +2479,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) goto exit; /* - * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) */ for (lspn = 0; lspn < npages - 3; lspn++) { lp->h.page = lp->t.page = cpu_to_le32(lspn); @@ -2501,7 +2495,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) rc = 0; exit: /* - * finalize log + * finalize log */ /* release the buffer */ lbmFree(bp); diff --git a/trunk/fs/jfs/jfs_logmgr.h 
b/trunk/fs/jfs/jfs_logmgr.h index 1f85ef0ec045..a53fb17ea219 100644 --- a/trunk/fs/jfs/jfs_logmgr.h +++ b/trunk/fs/jfs/jfs_logmgr.h @@ -144,7 +144,7 @@ struct logpage { * * (this comment should be rewritten !) * jfs uses only "after" log records (only a single writer is allowed - * in a page, pages are written to temporary paging space if + * in a page, pages are written to temporary paging space if * if they must be written to disk before commit, and i/o is * scheduled for modified pages to their home location after * the log records containing the after values and the commit @@ -153,7 +153,7 @@ struct logpage { * * a log record consists of a data area of variable length followed by * a descriptor of fixed size LOGRDSIZE bytes. - * the data area is rounded up to an integral number of 4-bytes and + * the data area is rounded up to an integral number of 4-bytes and * must be no longer than LOGPSIZE. * the descriptor is of size of multiple of 4-bytes and aligned on a * 4-byte boundary. @@ -215,13 +215,13 @@ struct lrd { union { /* - * COMMIT: commit + * COMMIT: commit * * transaction commit: no type-dependent information; */ /* - * REDOPAGE: after-image + * REDOPAGE: after-image * * apply after-image; * @@ -236,7 +236,7 @@ struct lrd { } redopage; /* (20) */ /* - * NOREDOPAGE: the page is freed + * NOREDOPAGE: the page is freed * * do not apply after-image records which precede this record * in the log with the same page block number to this page. @@ -252,7 +252,7 @@ struct lrd { } noredopage; /* (20) */ /* - * UPDATEMAP: update block allocation map + * UPDATEMAP: update block allocation map * * either in-line PXD, * or out-of-line XADLIST; @@ -268,7 +268,7 @@ struct lrd { } updatemap; /* (20) */ /* - * NOREDOINOEXT: the inode extent is freed + * NOREDOINOEXT: the inode extent is freed * * do not apply after-image records which precede this * record in the log with the any of the 4 page block @@ -286,7 +286,7 @@ struct lrd { } noredoinoext; /* (20) */ /* - * SYNCPT: log sync point + * SYNCPT: log sync point * * replay log upto syncpt address specified; */ @@ -295,13 +295,13 @@ struct lrd { } syncpt; /* - * MOUNT: file system mount + * MOUNT: file system mount * * file system mount: no type-dependent information; */ /* - * ? FREEXTENT: free specified extent(s) + * ? FREEXTENT: free specified extent(s) * * free specified extent(s) from block allocation map * N.B.: nextents should be length of data/sizeof(xad_t) @@ -314,7 +314,7 @@ struct lrd { } freextent; /* - * ? NOREDOFILE: this file is freed + * ? NOREDOFILE: this file is freed * * do not apply records which precede this record in the log * with the same inode number. @@ -330,7 +330,7 @@ struct lrd { } noredofile; /* - * ? NEWPAGE: + * ? NEWPAGE: * * metadata type dependent */ @@ -342,7 +342,7 @@ struct lrd { } newpage; /* - * ? DUMMY: filler + * ? 
DUMMY: filler * * no type-dependent information */ diff --git a/trunk/fs/jfs/jfs_metapage.c b/trunk/fs/jfs/jfs_metapage.c index 77c7f1129dde..43d4f69afbec 100644 --- a/trunk/fs/jfs/jfs_metapage.c +++ b/trunk/fs/jfs/jfs_metapage.c @@ -472,8 +472,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); goto skip; dump_bio: - print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, - 4, bio, sizeof(*bio), 0); + dump_mem("bio", bio, sizeof(*bio)); skip: bio_put(bio); unlock_page(page); diff --git a/trunk/fs/jfs/jfs_mount.c b/trunk/fs/jfs/jfs_mount.c index 644429acb8c0..4dd479834897 100644 --- a/trunk/fs/jfs/jfs_mount.c +++ b/trunk/fs/jfs/jfs_mount.c @@ -80,7 +80,7 @@ static int logMOUNT(struct super_block *sb); */ int jfs_mount(struct super_block *sb) { - int rc = 0; /* Return code */ + int rc = 0; /* Return code */ struct jfs_sb_info *sbi = JFS_SBI(sb); struct inode *ipaimap = NULL; struct inode *ipaimap2 = NULL; @@ -169,7 +169,7 @@ int jfs_mount(struct super_block *sb) sbi->ipaimap2 = NULL; /* - * mount (the only/single) fileset + * mount (the only/single) fileset */ /* * open fileset inode allocation map (aka fileset inode) @@ -195,7 +195,7 @@ int jfs_mount(struct super_block *sb) goto out; /* - * unwind on error + * unwind on error */ errout41: /* close fileset inode allocation map inode */ diFreeSpecial(ipimap); diff --git a/trunk/fs/jfs/jfs_txnmgr.c b/trunk/fs/jfs/jfs_txnmgr.c index 7aa1f7004eaf..25430d0b0d59 100644 --- a/trunk/fs/jfs/jfs_txnmgr.c +++ b/trunk/fs/jfs/jfs_txnmgr.c @@ -18,7 +18,7 @@ */ /* - * jfs_txnmgr.c: transaction manager + * jfs_txnmgr.c: transaction manager * * notes: * transaction starts with txBegin() and ends with txCommit() @@ -60,7 +60,7 @@ #include "jfs_debug.h" /* - * transaction management structures + * transaction management structures */ static struct { int freetid; /* index of a free tid structure */ @@ -103,19 +103,19 @@ module_param(nTxLock, int, 0); MODULE_PARM_DESC(nTxLock, "Number of transaction locks (max:65536)"); -struct tblock *TxBlock; /* transaction block table */ -static int TxLockLWM; /* Low water mark for number of txLocks used */ -static int TxLockHWM; /* High water mark for number of txLocks used */ -static int TxLockVHWM; /* Very High water mark */ -struct tlock *TxLock; /* transaction lock table */ +struct tblock *TxBlock; /* transaction block table */ +static int TxLockLWM; /* Low water mark for number of txLocks used */ +static int TxLockHWM; /* High water mark for number of txLocks used */ +static int TxLockVHWM; /* Very High water mark */ +struct tlock *TxLock; /* transaction lock table */ /* - * transaction management lock + * transaction management lock */ static DEFINE_SPINLOCK(jfsTxnLock); -#define TXN_LOCK() spin_lock(&jfsTxnLock) -#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) +#define TXN_LOCK() spin_lock(&jfsTxnLock) +#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) #define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) @@ -148,7 +148,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) #define TXN_WAKEUP(event) wake_up_all(event) /* - * statistics + * statistics */ static struct { tid_t maxtid; /* 4: biggest tid ever used */ @@ -181,8 +181,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, static void LogSyncRelease(struct metapage * mp); /* - * transaction block/lock management - * 
--------------------------------- + * transaction block/lock management + * --------------------------------- */ /* @@ -227,9 +227,9 @@ static void txLockFree(lid_t lid) } /* - * NAME: txInit() + * NAME: txInit() * - * FUNCTION: initialize transaction management structures + * FUNCTION: initialize transaction management structures * * RETURN: * @@ -333,9 +333,9 @@ int txInit(void) } /* - * NAME: txExit() + * NAME: txExit() * - * FUNCTION: clean up when module is unloaded + * FUNCTION: clean up when module is unloaded */ void txExit(void) { @@ -346,12 +346,12 @@ void txExit(void) } /* - * NAME: txBegin() + * NAME: txBegin() * - * FUNCTION: start a transaction. + * FUNCTION: start a transaction. * - * PARAMETER: sb - superblock - * flag - force for nested tx; + * PARAMETER: sb - superblock + * flag - force for nested tx; * * RETURN: tid - transaction id * @@ -447,13 +447,13 @@ tid_t txBegin(struct super_block *sb, int flag) } /* - * NAME: txBeginAnon() + * NAME: txBeginAnon() * - * FUNCTION: start an anonymous transaction. + * FUNCTION: start an anonymous transaction. * Blocks if logsync or available tlocks are low to prevent * anonymous tlocks from depleting supply. * - * PARAMETER: sb - superblock + * PARAMETER: sb - superblock * * RETURN: none */ @@ -489,11 +489,11 @@ void txBeginAnon(struct super_block *sb) } /* - * txEnd() + * txEnd() * * function: free specified transaction block. * - * logsync barrier processing: + * logsync barrier processing: * * serialization: */ @@ -577,13 +577,13 @@ void txEnd(tid_t tid) } /* - * txLock() + * txLock() * * function: acquire a transaction lock on the specified * * parameter: * - * return: transaction lock id + * return: transaction lock id * * serialization: */ @@ -829,16 +829,12 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, /* Only locks on ipimap or ipaimap should reach here */ /* assert(jfs_ip->fileset == AGGREGATE_I); */ if (jfs_ip->fileset != AGGREGATE_I) { - printk(KERN_ERR "txLock: trying to lock locked page!"); - print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, - ip, sizeof(*ip), 0); - print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, - mp, sizeof(*mp), 0); - print_hex_dump(KERN_ERR, "Locker's tblock: ", - DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), - sizeof(struct tblock), 0); - print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, - tlck, sizeof(*tlck), 0); + jfs_err("txLock: trying to lock locked page!"); + dump_mem("ip", ip, sizeof(struct inode)); + dump_mem("mp", mp, sizeof(struct metapage)); + dump_mem("Locker's tblk", tid_to_tblock(tid), + sizeof(struct tblock)); + dump_mem("Tlock", tlck, sizeof(struct tlock)); BUG(); } INCREMENT(stattx.waitlock); /* statistics */ @@ -861,17 +857,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, } /* - * NAME: txRelease() + * NAME: txRelease() * - * FUNCTION: Release buffers associated with transaction locks, but don't + * FUNCTION: Release buffers associated with transaction locks, but don't * mark homeok yet. The allows other transactions to modify * buffers, but won't let them go to disk until commit record * actually gets written. * * PARAMETER: - * tblk - + * tblk - * - * RETURN: Errors from subroutines. + * RETURN: Errors from subroutines. 
*/ static void txRelease(struct tblock * tblk) { @@ -900,10 +896,10 @@ static void txRelease(struct tblock * tblk) } /* - * NAME: txUnlock() + * NAME: txUnlock() * - * FUNCTION: Initiates pageout of pages modified by tid in journalled - * objects and frees their lockwords. + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. */ static void txUnlock(struct tblock * tblk) { @@ -987,10 +983,10 @@ static void txUnlock(struct tblock * tblk) } /* - * txMaplock() + * txMaplock() * * function: allocate a transaction lock for freed page/entry; - * for freed page, maplock is used as xtlock/dtlock type; + * for freed page, maplock is used as xtlock/dtlock type; */ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) { @@ -1061,7 +1057,7 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) } /* - * txLinelock() + * txLinelock() * * function: allocate a transaction lock for log vector list */ @@ -1096,39 +1092,39 @@ struct linelock *txLinelock(struct linelock * tlock) } /* - * transaction commit management - * ----------------------------- + * transaction commit management + * ----------------------------- */ /* - * NAME: txCommit() - * - * FUNCTION: commit the changes to the objects specified in - * clist. For journalled segments only the - * changes of the caller are committed, ie by tid. - * for non-journalled segments the data are flushed to - * disk and then the change to the disk inode and indirect - * blocks committed (so blocks newly allocated to the - * segment will be made a part of the segment atomically). - * - * all of the segments specified in clist must be in - * one file system. no more than 6 segments are needed - * to handle all unix svcs. - * - * if the i_nlink field (i.e. disk inode link count) - * is zero, and the type of inode is a regular file or - * directory, or symbolic link , the inode is truncated - * to zero length. the truncation is committed but the - * VM resources are unaffected until it is closed (see - * iput and iclose). + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). * * PARAMETER: * * RETURN: * * serialization: - * on entry the inode lock on each segment is assumed - * to be held. + * on entry the inode lock on each segment is assumed + * to be held. 
* * i/o error: */ @@ -1179,7 +1175,7 @@ int txCommit(tid_t tid, /* transaction identifier */ if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) tblk->xflag |= COMMIT_LAZY; /* - * prepare non-journaled objects for commit + * prepare non-journaled objects for commit * * flush data pages of non-journaled file * to prevent the file getting non-initialized disk blocks @@ -1190,7 +1186,7 @@ int txCommit(tid_t tid, /* transaction identifier */ cd.nip = nip; /* - * acquire transaction lock on (on-disk) inodes + * acquire transaction lock on (on-disk) inodes * * update on-disk inode from in-memory inode * acquiring transaction locks for AFTER records @@ -1266,7 +1262,7 @@ int txCommit(tid_t tid, /* transaction identifier */ } /* - * write log records from transaction locks + * write log records from transaction locks * * txUpdateMap() resets XAD_NEW in XAD. */ @@ -1298,7 +1294,7 @@ int txCommit(tid_t tid, /* transaction identifier */ !test_cflag(COMMIT_Nolink, tblk->u.ip))); /* - * write COMMIT log record + * write COMMIT log record */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; @@ -1307,7 +1303,7 @@ int txCommit(tid_t tid, /* transaction identifier */ lmGroupCommit(log, tblk); /* - * - transaction is now committed - + * - transaction is now committed - */ /* @@ -1318,11 +1314,11 @@ int txCommit(tid_t tid, /* transaction identifier */ txForce(tblk); /* - * update allocation map. + * update allocation map. * * update inode allocation map and inode: * free pager lock on memory object of inode if any. - * update block allocation map. + * update block allocation map. * * txUpdateMap() resets XAD_NEW in XAD. */ @@ -1330,7 +1326,7 @@ int txCommit(tid_t tid, /* transaction identifier */ txUpdateMap(tblk); /* - * free transaction locks and pageout/free pages + * free transaction locks and pageout/free pages */ txRelease(tblk); @@ -1339,7 +1335,7 @@ int txCommit(tid_t tid, /* transaction identifier */ /* - * reset in-memory object state + * reset in-memory object state */ for (k = 0; k < cd.nip; k++) { ip = cd.iplist[k]; @@ -1362,11 +1358,11 @@ int txCommit(tid_t tid, /* transaction identifier */ } /* - * NAME: txLog() + * NAME: txLog() * - * FUNCTION: Writes AFTER log records for all lines modified - * by tid for segments specified by inodes in comdata. - * Code assumes only WRITELOCKS are recorded in lockwords. + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. 
* * PARAMETERS: * @@ -1425,12 +1421,12 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) } /* - * diLog() + * diLog() * - * function: log inode tlock and format maplock to update bmap; + * function: log inode tlock and format maplock to update bmap; */ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd) + struct tlock * tlck, struct commit * cd) { int rc = 0; struct metapage *mp; @@ -1446,7 +1442,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxd = &lrd->log.redopage.pxd; /* - * inode after image + * inode after image */ if (tlck->type & tlckENTRY) { /* log after-image for logredo(): */ @@ -1460,7 +1456,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, tlck->flag |= tlckWRITEPAGE; } else if (tlck->type & tlckFREE) { /* - * free inode extent + * free inode extent * * (pages of the freed inode extent have been invalidated and * a maplock for free of the extent has been formatted at @@ -1502,7 +1498,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, jfs_err("diLog: UFO type tlck:0x%p", tlck); #ifdef _JFS_WIP /* - * alloc/free external EA extent + * alloc/free external EA extent * * a maplock for txUpdateMap() to update bPWMAP for alloc/free * of the extent has been formatted at txLock() time; @@ -1538,9 +1534,9 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * dataLog() + * dataLog() * - * function: log data tlock + * function: log data tlock */ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1584,9 +1580,9 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * dtLog() + * dtLog() * - * function: log dtree tlock and format maplock to update bmap; + * function: log dtree tlock and format maplock to update bmap; */ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1607,10 +1603,10 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); /* - * page extension via relocation: entry insertion; - * page extension in-place: entry insertion; - * new right page from page split, reinitialized in-line - * root from root page split: entry insertion; + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; */ if (tlck->type & (tlckNEW | tlckEXTEND)) { /* log after-image of the new page for logredo(): @@ -1645,8 +1641,8 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * entry insertion/deletion, - * sibling page link update (old right page before split); + * entry insertion/deletion, + * sibling page link update (old right page before split); */ if (tlck->type & (tlckENTRY | tlckRELINK)) { /* log after-image for logredo(): */ @@ -1662,11 +1658,11 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page deletion: page has been invalidated - * page relocation: source extent + * page deletion: page has been invalidated + * page relocation: source extent * - * a maplock for free of the page has been formatted - * at txLock() time); + * a maplock for free of the page has been formatted + * at txLock() time); */ if 
(tlck->type & (tlckFREE | tlckRELOCATE)) { /* log LOG_NOREDOPAGE of the deleted page for logredo() @@ -1687,9 +1683,9 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * xtLog() + * xtLog() * - * function: log xtree tlock and format maplock to update bmap; + * function: log xtree tlock and format maplock to update bmap; */ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1729,8 +1725,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, xadlock = (struct xdlistlock *) maplock; /* - * entry insertion/extension; - * sibling page link update (old right page before split); + * entry insertion/extension; + * sibling page link update (old right page before split); */ if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { /* log after-image for logredo(): @@ -1805,7 +1801,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page deletion: file deletion/truncation (ref. xtTruncate()) + * page deletion: file deletion/truncation (ref. xtTruncate()) * * (page will be invalidated after log is written and bmap * is updated from the page); @@ -1912,13 +1908,13 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page/entry truncation: file truncation (ref. xtTruncate()) + * page/entry truncation: file truncation (ref. xtTruncate()) * - * |----------+------+------+---------------| - * | | | - * | | hwm - hwm before truncation - * | next - truncation point - * lwm - lwm before truncation + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation * header ? */ if (tlck->type & tlckTRUNCATE) { @@ -1941,7 +1937,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, twm = xtlck->twm.offset; /* - * write log records + * write log records */ /* log after-image for logredo(): * @@ -2001,7 +1997,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * format maplock(s) for txUpdateMap() to update bmap + * format maplock(s) for txUpdateMap() to update bmap */ maplock->index = 0; @@ -2073,9 +2069,9 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * mapLog() + * mapLog() * - * function: log from maplock of freed data extents; + * function: log from maplock of freed data extents; */ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -2085,7 +2081,7 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxd_t *pxd; /* - * page relocation: free the source page extent + * page relocation: free the source page extent * * a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time saving the src @@ -2159,10 +2155,10 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * txEA() + * txEA() * - * function: acquire maplock for EA/ACL extents or - * set COMMIT_INLINE flag; + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; */ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) { @@ -2211,10 +2207,10 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) } /* - * txForce() + * txForce() * * function: synchronously write pages locked by transaction - * after txLog() but before txUpdateMap(); + * after txLog() but before 
txUpdateMap(); */ static void txForce(struct tblock * tblk) { @@ -2277,10 +2273,10 @@ static void txForce(struct tblock * tblk) } /* - * txUpdateMap() + * txUpdateMap() * - * function: update persistent allocation map (and working map - * if appropriate); + * function: update persistent allocation map (and working map + * if appropriate); * * parameter: */ @@ -2302,7 +2298,7 @@ static void txUpdateMap(struct tblock * tblk) /* - * update block allocation map + * update block allocation map * * update allocation state in pmap (and wmap) and * update lsn of the pmap page; @@ -2386,7 +2382,7 @@ static void txUpdateMap(struct tblock * tblk) } } /* - * update inode allocation map + * update inode allocation map * * update allocation state in pmap and * update lsn of the pmap page; @@ -2411,24 +2407,24 @@ static void txUpdateMap(struct tblock * tblk) } /* - * txAllocPMap() + * txAllocPMap() * * function: allocate from persistent map; * * parameter: - * ipbmap - - * malock - - * xad list: - * pxd: - * - * maptype - - * allocate from persistent map; - * free from persistent map; - * (e.g., tmp file - free from working map at releae - * of last reference); - * free from persistent and working map; - * - * lsn - log sequence number; + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; */ static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk) @@ -2482,9 +2478,9 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock, } /* - * txFreeMap() + * txFreeMap() * - * function: free from persistent and/or working map; + * function: free from persistent and/or working map; * * todo: optimization */ @@ -2583,9 +2579,9 @@ void txFreeMap(struct inode *ip, } /* - * txFreelock() + * txFreelock() * - * function: remove tlock from inode anonymous locklist + * function: remove tlock from inode anonymous locklist */ void txFreelock(struct inode *ip) { @@ -2623,7 +2619,7 @@ void txFreelock(struct inode *ip) } /* - * txAbort() + * txAbort() * * function: abort tx before commit; * @@ -2683,7 +2679,7 @@ void txAbort(tid_t tid, int dirty) } /* - * txLazyCommit(void) + * txLazyCommit(void) * * All transactions except those changing ipimap (COMMIT_FORCE) are * processed by this routine. This insures that the inode and block @@ -2732,7 +2728,7 @@ static void txLazyCommit(struct tblock * tblk) } /* - * jfs_lazycommit(void) + * jfs_lazycommit(void) * * To be run as a kernel daemon. If lbmIODone is called in an interrupt * context, or where blocking is not wanted, this routine will process @@ -2917,7 +2913,7 @@ void txResume(struct super_block *sb) } /* - * jfs_sync(void) + * jfs_sync(void) * * To be run as a kernel daemon. This is awakened when tlocks run low. 
* We write any inodes that have anonymous tlocks so they will become diff --git a/trunk/fs/jfs/jfs_txnmgr.h b/trunk/fs/jfs/jfs_txnmgr.h index ab7288937019..7863cf21afca 100644 --- a/trunk/fs/jfs/jfs_txnmgr.h +++ b/trunk/fs/jfs/jfs_txnmgr.h @@ -94,7 +94,7 @@ extern struct tblock *TxBlock; /* transaction block table */ */ struct tlock { lid_t next; /* 2: index next lockword on tid locklist - * next lockword on freelist + * next lockword on freelist */ tid_t tid; /* 2: transaction id holding lock */ diff --git a/trunk/fs/jfs/jfs_types.h b/trunk/fs/jfs/jfs_types.h index 649f9817accd..09b252958687 100644 --- a/trunk/fs/jfs/jfs_types.h +++ b/trunk/fs/jfs/jfs_types.h @@ -21,7 +21,7 @@ /* * jfs_types.h: * - * basic type/utility definitions + * basic type/utility definitions * * note: this header file must be the 1st include file * of JFS include list in all JFS .c file. @@ -54,8 +54,8 @@ struct timestruc_t { */ #define LEFTMOSTONE 0x80000000 -#define HIGHORDER 0x80000000u /* high order bit on */ -#define ONES 0xffffffffu /* all bit on */ +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ /* * logical xd (lxd) @@ -148,7 +148,7 @@ typedef struct { #define sizeDXD(dxd) le32_to_cpu((dxd)->size) /* - * directory entry argument + * directory entry argument */ struct component_name { int namlen; @@ -160,14 +160,14 @@ struct component_name { * DASD limit information - stored in directory inode */ struct dasd { - u8 thresh; /* Alert Threshold (in percent) */ - u8 delta; /* Alert Threshold delta (in percent) */ + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ u8 rsrvd1; - u8 limit_hi; /* DASD limit (in logical blocks) */ - __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ u8 rsrvd2[3]; - u8 used_hi; /* DASD usage (in logical blocks) */ - __le32 used_lo; /* DASD usage (in logical blocks) */ + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ }; #define DASDLIMIT(dasdp) \ diff --git a/trunk/fs/jfs/jfs_umount.c b/trunk/fs/jfs/jfs_umount.c index 7971f37534a3..a386f48c73fc 100644 --- a/trunk/fs/jfs/jfs_umount.c +++ b/trunk/fs/jfs/jfs_umount.c @@ -60,7 +60,7 @@ int jfs_umount(struct super_block *sb) jfs_info("UnMount JFS: sb:0x%p", sb); /* - * update superblock and close log + * update superblock and close log * * if mounted read-write and log based recovery was enabled */ diff --git a/trunk/fs/jfs/jfs_xtree.c b/trunk/fs/jfs/jfs_xtree.c index 1543906a2e0d..acc97c46d8a4 100644 --- a/trunk/fs/jfs/jfs_xtree.c +++ b/trunk/fs/jfs/jfs_xtree.c @@ -16,7 +16,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* - * jfs_xtree.c: extent allocation descriptor B+-tree manager + * jfs_xtree.c: extent allocation descriptor B+-tree manager */ #include @@ -32,30 +32,30 @@ /* * xtree local flag */ -#define XT_INSERT 0x00000001 +#define XT_INSERT 0x00000001 /* - * xtree key/entry comparison: extent offset + * xtree key/entry comparison: extent offset * * return: - * -1: k < start of extent - * 0: start_of_extent <= k <= end_of_extent - * 1: k > end_of_extent + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent */ #define XT_CMP(CMP, K, X, OFFSET64)\ {\ - OFFSET64 = offsetXAD(X);\ - (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ - ((K) < OFFSET64) ? 
-1 : 0;\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ } /* write a xad entry */ #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ {\ - (XAD)->flag = (FLAG);\ - XADoffset((XAD), (OFF));\ - XADlength((XAD), (LEN));\ - XADaddress((XAD), (ADDR));\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ } #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) @@ -76,13 +76,13 @@ MP = NULL;\ RC = -EIO;\ }\ - }\ + }\ } /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) -#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) /* xtree entry parameter descriptor */ struct xtsplit { @@ -97,7 +97,7 @@ struct xtsplit { /* - * statistics + * statistics */ #ifdef CONFIG_JFS_STATISTICS static struct { @@ -136,7 +136,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); #endif /* _STILL_TO_PORT */ /* - * xtLookup() + * xtLookup() * * function: map a single page into a physical extent; */ @@ -179,7 +179,7 @@ int xtLookup(struct inode *ip, s64 lstart, } /* - * compute the physical extent covering logical extent + * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. @@ -220,27 +220,27 @@ int xtLookup(struct inode *ip, s64 lstart, /* - * xtLookupList() + * xtLookupList() * * function: map a single logical extent into a list of physical extent; * * parameter: - * struct inode *ip, - * struct lxdlist *lxdlist, lxd list (in) - * struct xadlist *xadlist, xad list (in/out) - * int flag) + * struct inode *ip, + * struct lxdlist *lxdlist, lxd list (in) + * struct xadlist *xadlist, xad list (in/out) + * int flag) * * coverage of lxd by xad under assumption of * . lxd's are ordered and disjoint. * . xad's are ordered and disjoint. * * return: - * 0: success + * 0: success * * note: a page being written (even a single byte) is backed fully, - * except the last page which is only backed with blocks - * required to cover the last byte; - * the extent backing a page is fully contained within an xad; + * except the last page which is only backed with blocks + * required to cover the last byte; + * the extent backing a page is fully contained within an xad; */ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, struct xadlist * xadlist, int flag) @@ -284,7 +284,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, return rc; /* - * compute the physical extent covering logical extent + * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. @@ -343,7 +343,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, if (lstart >= size) goto mapend; - /* compare with the current xad */ + /* compare with the current xad */ goto compare1; } /* lxd is covered by xad */ @@ -430,7 +430,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, /* * lxd is partially covered by xad */ - else { /* (xend < lend) */ + else { /* (xend < lend) */ /* * get next xad @@ -477,22 +477,22 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, /* - * xtSearch() + * xtSearch() * - * function: search for the xad entry covering specified offset. + * function: search for the xad entry covering specified offset. 
* * parameters: - * ip - file object; - * xoff - extent offset; - * nextp - address of next extent (if any) for search miss - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag (XT_INSERT); + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); * * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. */ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, int *cmpp, struct btstack * btstack, int flag) @@ -517,7 +517,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, btstack->nsplit = 0; /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -642,7 +642,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* - * search hit + * search hit */ /* search hit - leaf page: * return the entry found @@ -692,7 +692,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, } /* - * search miss + * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. @@ -773,22 +773,22 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, } /* - * xtInsert() + * xtInsert() * * function: * * parameter: - * tid - transaction id; - * ip - file object; - * xflag - extent flag (XAD_NOTRECORDED): - * xoff - extent offset; - * xlen - extent length; - * xaddrp - extent address pointer (in/out): - * if (*xaddrp) - * caller allocated data extent at *xaddrp; - * else - * allocate data extent and return its xaddr; - * flag - + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - * * return: */ @@ -813,7 +813,7 @@ int xtInsert(tid_t tid, /* transaction id */ jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); /* - * search for the entry location at which to insert: + * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). @@ -853,13 +853,13 @@ int xtInsert(tid_t tid, /* transaction id */ } /* - * insert entry for new extent + * insert entry for new extent */ xflag |= XAD_NEW; /* - * if the leaf page is full, split the page and - * propagate up the router entry for the new page from split + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. 
*/ @@ -886,7 +886,7 @@ int xtInsert(tid_t tid, /* transaction id */ } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ /* * acquire a transaction lock on the leaf page; @@ -930,16 +930,16 @@ int xtInsert(tid_t tid, /* transaction id */ /* - * xtSplitUp() + * xtSplitUp() * * function: - * split full pages as propagating insertion up the tree + * split full pages as propagating insertion up the tree * * parameter: - * tid - transaction id; - * ip - file object; - * split - entry parameter descriptor; - * btstack - traverse stack from xtSearch() + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() * * return: */ @@ -1199,22 +1199,22 @@ xtSplitUp(tid_t tid, /* - * xtSplitPage() + * xtSplitPage() * * function: - * split a full non-root page into - * original/split/left page and new right page - * i.e., the original/split page remains as left page. + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. * * parameter: - * int tid, - * struct inode *ip, - * struct xtsplit *split, - * struct metapage **rmpp, - * u64 *rbnp, + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, * * return: - * Pointer to page in which to insert or NULL on error. + * Pointer to page in which to insert or NULL on error. */ static int xtSplitPage(tid_t tid, struct inode *ip, @@ -1248,9 +1248,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { - rc = -EDQUOT; - goto clean_up; + if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { + rc = -EDQUOT; + goto clean_up; } quota_allocation += lengthPXD(pxd); @@ -1304,7 +1304,7 @@ xtSplitPage(tid_t tid, struct inode *ip, skip = split->index; /* - * sequential append at tail (after last entry of last page) + * sequential append at tail (after last entry of last page) * * if splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is @@ -1342,7 +1342,7 @@ xtSplitPage(tid_t tid, struct inode *ip, } /* - * non-sequential insert (at possibly middle page) + * non-sequential insert (at possibly middle page) */ /* @@ -1465,24 +1465,25 @@ xtSplitPage(tid_t tid, struct inode *ip, /* - * xtSplitRoot() + * xtSplitRoot() * * function: - * split the full root page into original/root/split page and new - * right page - * i.e., root remains fixed in tree anchor (inode) and the root is - * copied to a single new right child page since root page << - * non-root page, and the split root page contains a single entry - * for the new right child page. + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. * * parameter: - * int tid, - * struct inode *ip, - * struct xtsplit *split, - * struct metapage **rmpp) + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) * * return: - * Pointer to page in which to insert or NULL on error. + * Pointer to page in which to insert or NULL on error. 
*/ static int xtSplitRoot(tid_t tid, @@ -1504,7 +1505,7 @@ xtSplitRoot(tid_t tid, INCREMENT(xtStat.split); /* - * allocate a single (right) child page + * allocate a single (right) child page */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; @@ -1572,7 +1573,7 @@ xtSplitRoot(tid_t tid, } /* - * reset the root + * reset the root * * init root with the single entry for the new right page * set the 1st entry offset to 0, which force the left-most key @@ -1609,7 +1610,7 @@ xtSplitRoot(tid_t tid, /* - * xtExtend() + * xtExtend() * * function: extend in-place; * @@ -1676,7 +1677,7 @@ int xtExtend(tid_t tid, /* transaction id */ goto extendOld; /* - * extent overflow: insert entry for new extent + * extent overflow: insert entry for new extent */ //insertNew: xoff = offsetXAD(xad) + MAXXLEN; @@ -1684,8 +1685,8 @@ int xtExtend(tid_t tid, /* transaction id */ nextindex = le16_to_cpu(p->header.nextindex); /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -1730,7 +1731,7 @@ int xtExtend(tid_t tid, /* transaction id */ } } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ @@ -1770,11 +1771,11 @@ int xtExtend(tid_t tid, /* transaction id */ #ifdef _NOTYET /* - * xtTailgate() + * xtTailgate() * * function: split existing 'tail' extent - * (split offset >= start offset of tail extent), and - * relocate and extend the split tail half; + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; * * note: existing extent may or may not have been committed. * caller is responsible for pager buffer cache update, and @@ -1803,7 +1804,7 @@ int xtTailgate(tid_t tid, /* transaction id */ /* printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", - (ulong)xoff, xlen, (ulong)xaddr); + (ulong)xoff, xlen, (ulong)xaddr); */ /* there must exist extent to be tailgated */ @@ -1841,18 +1842,18 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", xad = &p->xad[index]; /* printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", - (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); */ if ((llen = xoff - offsetXAD(xad)) == 0) goto updateOld; /* - * partially replace extent: insert entry for new extent + * partially replace extent: insert entry for new extent */ //insertNew: /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. 
*/ @@ -1897,7 +1898,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", } } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ @@ -1954,17 +1955,17 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", #endif /* _NOTYET */ /* - * xtUpdate() + * xtUpdate() * * function: update XAD; * - * update extent for allocated_but_not_recorded or - * compressed extent; + * update extent for allocated_but_not_recorded or + * compressed extent; * * parameter: - * nxad - new XAD; - * logical extent of the specified XAD must be completely - * contained by an existing XAD; + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; */ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) { /* new XAD */ @@ -2415,19 +2416,19 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); /* - * xtAppend() + * xtAppend() * * function: grow in append mode from contiguous region specified ; * * parameter: - * tid - transaction id; - * ip - file object; - * xflag - extent flag: - * xoff - extent offset; - * maxblocks - max extent length; - * xlen - extent length (in/out); - * xaddrp - extent address pointer (in/out): - * flag - + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - * * return: */ @@ -2459,7 +2460,7 @@ int xtAppend(tid_t tid, /* transaction id */ (ulong) xoff, maxblocks, xlen, (ulong) xaddr); /* - * search for the entry location at which to insert: + * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). @@ -2481,13 +2482,13 @@ int xtAppend(tid_t tid, /* transaction id */ xlen = min(xlen, (int)(next - xoff)); //insert: /* - * insert entry for new extent + * insert entry for new extent */ xflag |= XAD_NEW; /* - * if the leaf page is full, split the page and - * propagate up the router entry for the new page from split + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -2544,7 +2545,7 @@ int xtAppend(tid_t tid, /* transaction id */ return 0; /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ insertLeaf: /* @@ -2588,17 +2589,17 @@ int xtAppend(tid_t tid, /* transaction id */ /* - TBD for defragmentaion/reorganization - * - * xtDelete() + * xtDelete() * * function: - * delete the entry with the specified key. + * delete the entry with the specified key. * - * N.B.: whole extent of the entry is assumed to be deleted. + * N.B.: whole extent of the entry is assumed to be deleted. * * parameter: * * return: - * ENOENT: if the entry is not found. + * ENOENT: if the entry is not found. 
* * exception: */ @@ -2664,10 +2665,10 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) /* - TBD for defragmentaion/reorganization - * - * xtDeleteUp() + * xtDeleteUp() * * function: - * free empty pages as propagating deletion up the tree + * free empty pages as propagating deletion up the tree * * parameter: * @@ -2814,15 +2815,15 @@ xtDeleteUp(tid_t tid, struct inode *ip, /* - * NAME: xtRelocate() + * NAME: xtRelocate() * - * FUNCTION: relocate xtpage or data extent of regular file; - * This function is mainly used by defragfs utility. + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. * - * NOTE: This routine does not have the logic to handle - * uncommitted allocated extent. The caller should call - * txCommit() to commit all the allocation before call - * this routine. + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. */ int xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ @@ -2864,8 +2865,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); /* - * 1. get and validate the parent xtpage/xad entry - * covering the source extent to be relocated; + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; */ if (xtype == DATAEXT) { /* search in leaf entry */ @@ -2909,7 +2910,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ jfs_info("xtRelocate: parent xad entry validated."); /* - * 2. relocate the extent + * 2. relocate the extent */ if (xtype == DATAEXT) { /* if the extent is allocated-but-not-recorded @@ -2922,7 +2923,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ XT_PUTPAGE(pmp); /* - * cmRelocate() + * cmRelocate() * * copy target data pages to be relocated; * @@ -2944,8 +2945,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ pno = offset >> CM_L2BSIZE; npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; /* - npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - - (offset >> CM_L2BSIZE) + 1; + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; */ sxaddr = oxaddr; dxaddr = nxaddr; @@ -2980,7 +2981,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); jfs_info("xtRelocate: target data extent relocated."); - } else { /* (xtype == XTPAGE) */ + } else { /* (xtype == XTPAGE) */ /* * read in the target xtpage from the source extent; @@ -3025,14 +3026,16 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ */ if (lmp) { BT_MARK_DIRTY(lmp, ip); - tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + tlck = + txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); lp->header.next = cpu_to_le64(nxaddr); XT_PUTPAGE(lmp); } if (rmp) { BT_MARK_DIRTY(rmp, ip); - tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + tlck = + txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); rp->header.prev = cpu_to_le64(nxaddr); XT_PUTPAGE(rmp); } @@ -3059,7 +3062,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ * scan may be skipped by commit() and logredo(); */ BT_MARK_DIRTY(mp, ip); - /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); xtlck = (struct 
xtlock *) & tlck->lock; @@ -3081,7 +3084,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ } /* - * 3. acquire maplock for the source extent to be freed; + * 3. acquire maplock for the source extent to be freed; * * acquire a maplock saving the src relocated extent address; * to free of the extent at commit time; @@ -3102,7 +3105,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ * is no buffer associated with this lock since the buffer * has been redirected to the target location. */ - else /* (xtype == XTPAGE) */ + else /* (xtype == XTPAGE) */ tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; @@ -3112,7 +3115,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ pxdlock->index = 1; /* - * 4. update the parent xad entry for relocation; + * 4. update the parent xad entry for relocation; * * acquire tlck for the parent entry with XAD_NEW as entry * update which will write LOG_REDOPAGE and update bmap for @@ -3140,22 +3143,22 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ /* - * xtSearchNode() + * xtSearchNode() * - * function: search for the internal xad entry covering specified extent. - * This function is mainly used by defragfs utility. + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. * * parameters: - * ip - file object; - * xad - extent to find; - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag; + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; * * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. */ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ int *cmpp, struct btstack * btstack, int flag) @@ -3178,7 +3181,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ xaddr = addressXAD(xad); /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -3214,7 +3217,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* - * search hit + * search hit * * verify for exact match; */ @@ -3242,7 +3245,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ } /* - * search miss - non-leaf page: + * search miss - non-leaf page: * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. @@ -3265,15 +3268,15 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ /* - * xtRelink() + * xtRelink() * * function: - * link around a freed page. + * link around a freed page. 
* * Parameter: - * int tid, - * struct inode *ip, - * xtpage_t *p) + * int tid, + * struct inode *ip, + * xtpage_t *p) * * returns: */ @@ -3335,7 +3338,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) /* - * xtInitRoot() + * xtInitRoot() * * initialize file root (inline in inode) */ @@ -3382,42 +3385,42 @@ void xtInitRoot(tid_t tid, struct inode *ip) #define MAX_TRUNCATE_LEAVES 50 /* - * xtTruncate() + * xtTruncate() * * function: - * traverse for truncation logging backward bottom up; - * terminate at the last extent entry at the current subtree - * root page covering new down size. - * truncation may occur within the last extent entry. + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. * * parameter: - * int tid, - * struct inode *ip, - * s64 newsize, - * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} * * return: * * note: - * PWMAP: - * 1. truncate (non-COMMIT_NOLINK file) - * by jfs_truncate() or jfs_open(O_TRUNC): - * xtree is updated; + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; * 2. truncate index table of directory when last entry removed - * map update via tlock at commit time; - * PMAP: + * map update via tlock at commit time; + * PMAP: * Call xtTruncate_pmap instead - * WMAP: - * 1. remove (free zero link count) on last reference release - * (pmap has been freed at commit zero link count); - * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): - * xtree is updated; - * map update directly at truncation time; + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; * - * if (DELETE) - * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); - * else if (TRUNCATE) - * must write LOG_NOREDOPAGE for deleted index page; + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; * * pages may already have been tlocked by anonymous transactions * during file growth (i.e., write) before truncation; @@ -3490,7 +3493,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * retained in the new sized file. * if type is PMAP, the data and index pages are NOT * freed, and the data and index blocks are NOT freed - * from working map. + * from working map. * (this will allow continued access of data/index of * temporary file (zerolink count file truncated to zero-length)). 
*/ @@ -3539,7 +3542,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) goto getChild; /* - * leaf page + * leaf page */ freed = 0; @@ -3913,7 +3916,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) } /* - * internal page: go down to child page of current entry + * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ @@ -3962,7 +3965,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) /* - * xtTruncate_pmap() + * xtTruncate_pmap() * * function: * Perform truncate to zero lenghth for deleted file, leaving the @@ -3971,9 +3974,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * is committed to disk. * * parameter: - * tid_t tid, - * struct inode *ip, - * s64 committed_size) + * tid_t tid, + * struct inode *ip, + * s64 committed_size) * * return: new committed size * @@ -4047,7 +4050,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } /* - * leaf page + * leaf page */ if (++locked_leaves > MAX_TRUNCATE_LEAVES) { @@ -4059,7 +4062,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) xoff = offsetXAD(xad); xlen = lengthXAD(xad); XT_PUTPAGE(mp); - return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; } tlck = txLock(tid, ip, mp, tlckXTREE); tlck->type = tlckXTREE | tlckFREE; @@ -4096,7 +4099,8 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; - xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; tlck->type = tlckXTREE | tlckFREE; XT_PUTPAGE(mp); @@ -4114,7 +4118,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) else index--; /* - * internal page: go down to child page of current entry + * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ diff --git a/trunk/fs/jfs/jfs_xtree.h b/trunk/fs/jfs/jfs_xtree.h index 70815c8a3d6a..164f6f2b1019 100644 --- a/trunk/fs/jfs/jfs_xtree.h +++ b/trunk/fs/jfs/jfs_xtree.h @@ -19,14 +19,14 @@ #define _H_JFS_XTREE /* - * jfs_xtree.h: extent allocation descriptor B+-tree manager + * jfs_xtree.h: extent allocation descriptor B+-tree manager */ #include "jfs_btree.h" /* - * extent allocation descriptor (xad) + * extent allocation descriptor (xad) */ typedef struct xad { unsigned flag:8; /* 1: flag */ @@ -38,30 +38,30 @@ typedef struct xad { __le32 addr2; /* 4: address in unit of fsblksize */ } xad_t; /* (16) */ -#define MAXXLEN ((1 << 24) - 1) +#define MAXXLEN ((1 << 24) - 1) -#define XTSLOTSIZE 16 -#define L2XTSLOTSIZE 4 +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 /* xad_t field construction */ #define XADoffset(xad, offset64)\ {\ - (xad)->off1 = ((u64)offset64) >> 32;\ - (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ } #define XADaddress(xad, address64)\ {\ - (xad)->addr1 = ((u64)address64) >> 32;\ - (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ + (xad)->addr1 = ((u64)address64) >> 32;\ + (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ } -#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) +#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) /* xad_t field extraction */ #define offsetXAD(xad)\ - ( 
((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) #define addressXAD(xad)\ - ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) -#define lengthXAD(xad) __le24_to_cpu((xad)->len) + ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) +#define lengthXAD(xad) __le24_to_cpu((xad)->len) /* xad list */ struct xadlist { @@ -71,22 +71,22 @@ struct xadlist { }; /* xad_t flags */ -#define XAD_NEW 0x01 /* new */ -#define XAD_EXTENDED 0x02 /* extended */ -#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ #define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ -#define XAD_COW 0x10 /* copy-on-write */ +#define XAD_COW 0x10 /* copy-on-write */ /* possible values for maxentry */ -#define XTROOTINITSLOT_DIR 6 -#define XTROOTINITSLOT 10 -#define XTROOTMAXSLOT 18 -#define XTPAGEMAXSLOT 256 -#define XTENTRYSTART 2 +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 /* - * xtree page: + * xtree page: */ typedef union { struct xtheader { @@ -106,7 +106,7 @@ typedef union { } xtpage_t; /* - * external declaration + * external declaration */ extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, int *pflag, s64 * paddr, int *plen, int flag); diff --git a/trunk/fs/jfs/namei.c b/trunk/fs/jfs/namei.c index 25161c4121e4..41c204771262 100644 --- a/trunk/fs/jfs/namei.c +++ b/trunk/fs/jfs/namei.c @@ -328,7 +328,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) * dentry - child directory dentry * * RETURN: -EINVAL - if name is . or .. - * -EINVAL - if . or .. exist but are invalid. + * -EINVAL - if . or .. exist but are invalid. * errors from subroutines * * note: @@ -517,7 +517,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) inode_dec_link_count(ip); /* - * commit zero link count object + * commit zero link count object */ if (ip->i_nlink == 0) { assert(!test_cflag(COMMIT_Nolink, ip)); @@ -596,7 +596,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) /* * NAME: commitZeroLink() * - * FUNCTION: for non-directory, called by jfs_remove(), + * FUNCTION: for non-directory, called by jfs_remove(), * truncate a regular file, directory or symbolic * link to zero length. return 0 if type is not * one of these. 
@@ -676,7 +676,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) /* * NAME: jfs_free_zero_link() * - * FUNCTION: for non-directory, called by iClose(), + * FUNCTION: for non-directory, called by iClose(), * free resources of a file from cache and WORKING map * for a file previously committed with zero link count * while associated with a pager object, @@ -855,12 +855,12 @@ static int jfs_link(struct dentry *old_dentry, * NAME: jfs_symlink(dip, dentry, name) * * FUNCTION: creates a symbolic link to by name - * in directory + * in directory * - * PARAMETER: dip - parent directory vnode - * dentry - dentry of symbolic link - * name - the path name of the existing object - * that will be the source of the link + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link * * RETURN: errors from subroutines * @@ -1052,9 +1052,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, /* - * NAME: jfs_rename + * NAME: jfs_rename * - * FUNCTION: rename a file or directory + * FUNCTION: rename a file or directory */ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) @@ -1331,9 +1331,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* - * NAME: jfs_mknod + * NAME: jfs_mknod * - * FUNCTION: Create a special file (device) + * FUNCTION: Create a special file (device) */ static int jfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) diff --git a/trunk/fs/jfs/resize.c b/trunk/fs/jfs/resize.c index 71984ee95346..79d625f3f733 100644 --- a/trunk/fs/jfs/resize.c +++ b/trunk/fs/jfs/resize.c @@ -29,17 +29,17 @@ #include "jfs_txnmgr.h" #include "jfs_debug.h" -#define BITSPERPAGE (PSIZE << 3) -#define L2MEGABYTE 20 -#define MEGABYTE (1 << L2MEGABYTE) -#define MEGABYTE32 (MEGABYTE << 5) +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) /* convert block number to bmap file page number */ #define BLKTODMAPN(b)\ - (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) /* - * jfs_extendfs() + * jfs_extendfs() * * function: extend file system; * @@ -48,9 +48,9 @@ * workspace space * * input: - * new LVSize: in LV blocks (required) - * new LogSize: in LV blocks (optional) - * new FSSize: in LV blocks (optional) + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) * * new configuration: * 1. 
set new LogSize as specified or default from new LVSize; @@ -125,8 +125,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * reconfigure LV spaces - * --------------------- + * reconfigure LV spaces + * --------------------- * * validate new size, or, if not specified, determine new size */ @@ -198,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) log_formatted = 1; } /* - * quiesce file system + * quiesce file system * * (prepare to move the inline log and to prevent map update) * @@ -270,8 +270,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * extend block allocation map - * --------------------------- + * extend block allocation map + * --------------------------- * * extendfs() for new extension, retry after crash recovery; * @@ -283,7 +283,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * s_size: aggregate size in physical blocks; */ /* - * compute the new block allocation map configuration + * compute the new block allocation map configuration * * map dinode: * di_size: map file size in byte; @@ -301,7 +301,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) newNpages = BLKTODMAPN(t64) + 1; /* - * extend map from current map (WITHOUT growing mapfile) + * extend map from current map (WITHOUT growing mapfile) * * map new extension with unmapped part of the last partial * dmap page, if applicable, and extra page(s) allocated @@ -341,8 +341,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) XSize -= nblocks; /* - * grow map file to cover remaining extension - * and/or one extra dmap page for next extendfs(); + * grow map file to cover remaining extension + * and/or one extra dmap page for next extendfs(); * * allocate new map pages and its backing blocks, and * update map file xtree @@ -422,8 +422,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) dbFinalizeBmap(ipbmap); /* - * update inode allocation map - * --------------------------- + * update inode allocation map + * --------------------------- * * move iag lists from old to new iag; * agstart field is not updated for logredo() to reconstruct @@ -442,8 +442,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * finalize - * -------- + * finalize + * -------- * * extension is committed when on-disk super block is * updated with new descriptors: logredo will recover @@ -480,7 +480,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) diFreeSpecial(ipbmap2); /* - * update superblock + * update superblock */ if ((rc = readSuper(sb, &bh))) goto error_out; @@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) resume: /* - * resume file system transactions + * resume file system transactions */ txResume(sb); diff --git a/trunk/fs/jfs/xattr.c b/trunk/fs/jfs/xattr.c index b2375f0774b7..b753ba216450 100644 --- a/trunk/fs/jfs/xattr.c +++ b/trunk/fs/jfs/xattr.c @@ -63,9 +63,9 @@ * * On-disk: * - * FEALISTs are stored on disk using blocks allocated by dbAlloc() and - * written directly. An EA list may be in-lined in the inode if there is - * sufficient room available. + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. An EA list may be in-lined in the inode if there is + * sufficient room available. 
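As a worked example of the BLKTODMAPN() macro in the resize.c hunk above: each dmap page maps 2^13 blocks, so (b >> 13) counts dmap pages, while the (b >> 23) and (b >> 33) terms appear to account for the interleaved level-0/level-1 summary pages and the "+ 3 + 1" for the fixed control pages at the front of the bmap file. A small userspace check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

/* same arithmetic as BLKTODMAPN() in trunk/fs/jfs/resize.c */
static uint64_t blk_to_dmap_page(uint64_t b)
{
        return (b >> 13) + (b >> 23) + (b >> 33) + 3 + 1;
}

int main(void)
{
        /* block 0 falls in bmap page 4, right after the fixed control pages */
        printf("%llu\n", (unsigned long long)blk_to_dmap_page(0));
        /* block 2^23 adds 1024 dmap pages plus one intervening summary page */
        printf("%llu\n", (unsigned long long)blk_to_dmap_page(1ULL << 23));
        return 0;
}

This prints 4 and 1029 respectively.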
*/ struct ea_buffer { @@ -590,8 +590,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, ea_size, 1); + dump_mem("xattr", ea_buf->xattr, ea_size); ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index 98e78e2f18d6..74f30e0c0381 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -165,6 +165,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -172,6 +173,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), + (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, @@ -310,41 +312,6 @@ int proc_pid_status(struct task_struct *task, char * buffer) return buffer - orig; } -static clock_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - return utime; -} - -static clock_t task_stime(struct task_struct *p) -{ - clock_t stime = cputime_to_clock_t(p->stime); - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p); - - return stime; -} - - static int do_task_stat(struct task_struct *task, char * buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; @@ -359,8 +326,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; - cputime_t cutime, cstime; - clock_t utime, stime; + cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -378,8 +344,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = cputime_zero; - utime = stime = 0; + cutime = cstime = utime = stime = cputime_zero; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -405,15 +370,15 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime += task_utime(t); - stime += task_stime(t); + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime += cputime_to_clock_t(sig->utime); - stime += cputime_to_clock_t(sig->stime); + utime = cputime_add(utime, sig->utime); + stime = cputime_add(stime, sig->stime); } sid = signal_session(sig); @@ -429,8 +394,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - utime = task_utime(task); - stime = 
task_stime(task); + utime = task->utime; + stime = task->stime; } /* scale priority and nice values from timeslices to -20..20 */ @@ -461,8 +426,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) cmin_flt, maj_flt, cmaj_flt, - utime, - stime, + cputime_to_clock_t(utime), + cputime_to_clock_t(stime), cputime_to_clock_t(cutime), cputime_to_clock_t(cstime), priority, diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 46ea5d56e1bb..a5fa1fdafc4e 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -296,7 +296,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) */ static int proc_pid_schedstat(struct task_struct *task, char *buffer) { - return sprintf(buffer, "%llu %llu %lu\n", + return sprintf(buffer, "%lu %lu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, task->sched_info.pcnt); @@ -929,69 +929,6 @@ static const struct file_operations proc_fault_inject_operations = { }; #endif -#ifdef CONFIG_SCHED_DEBUG -/* - * Print out various scheduling related per-task fields: - */ -static int sched_show(struct seq_file *m, void *v) -{ - struct inode *inode = m->private; - struct task_struct *p; - - WARN_ON(!inode); - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_show_task(p, m); - - put_task_struct(p); - - return 0; -} - -static ssize_t -sched_write(struct file *file, const char __user *buf, - size_t count, loff_t *offset) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct task_struct *p; - - WARN_ON(!inode); - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_set_task(p); - - put_task_struct(p); - - return count; -} - -static int sched_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = single_open(filp, sched_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; -} - -static const struct file_operations proc_pid_sched_operations = { - .open = sched_open, - .read = seq_read, - .write = sched_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -2026,9 +1963,6 @@ static const struct pid_entry tgid_base_stuff[] = { INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), -#ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), -#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tgid_stat), INF("statm", S_IRUGO, pid_statm), @@ -2313,9 +2247,6 @@ static const struct pid_entry tid_base_stuff[] = { INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), -#ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), -#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tid_stat), INF("statm", S_IRUGO, pid_statm), diff --git a/trunk/include/asm-generic/bitops/sched.h b/trunk/include/asm-generic/bitops/sched.h index 604fab7031a6..815bb0148060 100644 --- a/trunk/include/asm-generic/bitops/sched.h +++ b/trunk/include/asm-generic/bitops/sched.h @@ -6,23 +6,28 @@ /* * Every architecture must define this function. It's the fastest - * way of searching a 100-bit bitmap. It's guaranteed that at least - * one of the 100 bits is cleared. + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. 
*/ static inline int sched_find_first_bit(const unsigned long *b) { #if BITS_PER_LONG == 64 - if (b[0]) + if (unlikely(b[0])) return __ffs(b[0]); - return __ffs(b[1]) + 64; + if (likely(b[1])) + return __ffs(b[1]) + 64; + return __ffs(b[2]) + 128; #elif BITS_PER_LONG == 32 - if (b[0]) + if (unlikely(b[0])) return __ffs(b[0]); - if (b[1]) + if (unlikely(b[1])) return __ffs(b[1]) + 32; - if (b[2]) + if (unlikely(b[2])) return __ffs(b[2]) + 64; - return __ffs(b[3]) + 96; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; #else #error BITS_PER_LONG not defined #endif diff --git a/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h b/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h index 4663e8b415c9..8fcae21adbd5 100644 --- a/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h +++ b/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h @@ -88,26 +88,26 @@ static const struct drive_list_entry dma_white_list [] = { /* * Hitachi */ - { "HITACHI_DK14FA-20" , NULL }, - { "HTS726060M9AT00" , NULL }, + { "HITACHI_DK14FA-20" , "ALL" }, + { "HTS726060M9AT00" , "ALL" }, /* * Maxtor */ - { "Maxtor 6E040L0" , NULL }, - { "Maxtor 6Y080P0" , NULL }, - { "Maxtor 6Y160P0" , NULL }, + { "Maxtor 6E040L0" , "ALL" }, + { "Maxtor 6Y080P0" , "ALL" }, + { "Maxtor 6Y160P0" , "ALL" }, /* * Seagate */ - { "ST3120026A" , NULL }, - { "ST320014A" , NULL }, - { "ST94011A" , NULL }, - { "ST340016A" , NULL }, + { "ST3120026A" , "ALL" }, + { "ST320014A" , "ALL" }, + { "ST94011A" , "ALL" }, + { "ST340016A" , "ALL" }, /* * Western Digital */ - { "WDC WD400UE-00HCT0" , NULL }, - { "WDC WD400JB-00JJC0" , NULL }, + { "WDC WD400UE-00HCT0" , "ALL" }, + { "WDC WD400JB-00JJC0" , "ALL" }, { NULL , NULL } }; @@ -116,9 +116,9 @@ static const struct drive_list_entry dma_black_list [] = { /* * Western Digital */ - { "WDC WD100EB-00CGH0" , NULL }, - { "WDC WD200BB-00AUA1" , NULL }, - { "WDC AC24300L" , NULL }, + { "WDC WD100EB-00CGH0" , "ALL" }, + { "WDC WD200BB-00AUA1" , "ALL" }, + { "WDC AC24300L" , "ALL" }, { NULL , NULL } }; #endif diff --git a/trunk/include/linux/eeprom_93cx6.h b/trunk/include/linux/eeprom_93cx6.h new file mode 100644 index 000000000000..d774b7778c91 --- /dev/null +++ b/trunk/include/linux/eeprom_93cx6.h @@ -0,0 +1,72 @@ +/* + Copyright (C) 2004 - 2006 rt2x00 SourceForge Project + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the + Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + Module: eeprom_93cx6 + Abstract: EEPROM reader datastructures for 93cx6 chipsets. + Supported chipsets: 93c46 & 93c66. + */ + +/* + * EEPROM operation defines. + */ +#define PCI_EEPROM_WIDTH_93C46 6 +#define PCI_EEPROM_WIDTH_93C66 8 +#define PCI_EEPROM_WIDTH_OPCODE 3 +#define PCI_EEPROM_WRITE_OPCODE 0x05 +#define PCI_EEPROM_READ_OPCODE 0x06 +#define PCI_EEPROM_EWDS_OPCODE 0x10 +#define PCI_EEPROM_EWEN_OPCODE 0x13 + +/** + * struct eeprom_93cx6 - control structure for setting the commands + * for reading the eeprom data. 
+ * @data: private pointer for the driver. + * @register_read(struct eeprom_93cx6 *eeprom): handler to + * read the eeprom register, this function should set all reg_* fields. + * @register_write(struct eeprom_93cx6 *eeprom): handler to + * write to the eeprom register by using all reg_* fields. + * @width: eeprom width, should be one of the PCI_EEPROM_WIDTH_* defines + * @reg_data_in: register field to indicate data input + * @reg_data_out: register field to indicate data output + * @reg_data_clock: register field to set the data clock + * @reg_chip_select: register field to set the chip select + * + * This structure is used for the communication between the driver + * and the eeprom_93cx6 handlers for reading the eeprom. + */ +struct eeprom_93cx6 { + void *data; + + void (*register_read)(struct eeprom_93cx6 *eeprom); + void (*register_write)(struct eeprom_93cx6 *eeprom); + + int width; + + char reg_data_in; + char reg_data_out; + char reg_data_clock; + char reg_chip_select; +}; + +extern void eeprom_93cx6_read(struct eeprom_93cx6 *eeprom, + const u8 word, u16 *data); +extern void eeprom_93cx6_multiread(struct eeprom_93cx6 *eeprom, + const u8 word, __le16 *data, const u16 words); diff --git a/trunk/include/linux/hardirq.h b/trunk/include/linux/hardirq.h index 8d302298a161..7803014f3a11 100644 --- a/trunk/include/linux/hardirq.h +++ b/trunk/include/linux/hardirq.h @@ -78,19 +78,6 @@ # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) #endif -#ifdef CONFIG_PREEMPT -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_CHECK_OFFSET 0 -#endif - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) - #ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) diff --git a/trunk/include/linux/ide.h b/trunk/include/linux/ide.h index 19ab25804056..1e365acdd369 100644 --- a/trunk/include/linux/ide.h +++ b/trunk/include/linux/ide.h @@ -25,7 +25,6 @@ #include #include #include -#include /****************************************************************************** * IDE driver configuration options (play with these as desired): @@ -686,8 +685,6 @@ typedef struct hwif_s { u8 mwdma_mask; u8 swdma_mask; - u8 cbl; /* cable type */ - hwif_chipset_t chipset; /* sub-module for tuning.. 
*/ struct pci_dev *pci_dev; /* for pci chipsets */ @@ -738,8 +735,8 @@ typedef struct hwif_s { void (*ide_dma_clear_irq)(ide_drive_t *drive); void (*dma_host_on)(ide_drive_t *drive); void (*dma_host_off)(ide_drive_t *drive); - void (*dma_lost_irq)(ide_drive_t *drive); - void (*dma_timeout)(ide_drive_t *drive); + int (*ide_dma_lostirq)(ide_drive_t *drive); + int (*ide_dma_timeout)(ide_drive_t *drive); void (*OUTB)(u8 addr, unsigned long port); void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port); @@ -794,6 +791,7 @@ typedef struct hwif_s { unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ unsigned reset : 1; /* reset after probe */ unsigned autodma : 1; /* auto-attempt using DMA at boot */ + unsigned udma_four : 1; /* 1=ATA-66 capable, 0=default */ unsigned no_lba48 : 1; /* 1 = cannot do LBA48 */ unsigned no_lba48_dma : 1; /* 1 = cannot do LBA48 DMA */ unsigned auto_poll : 1; /* supports nop auto-poll */ @@ -865,7 +863,7 @@ typedef struct hwgroup_s { typedef struct ide_driver_s ide_driver_t; -extern struct mutex ide_setting_mtx; +extern struct semaphore ide_setting_sem; int set_io_32bit(ide_drive_t *, int); int set_pio_mode(ide_drive_t *, int); @@ -1306,8 +1304,8 @@ extern int __ide_dma_check(ide_drive_t *); extern int ide_dma_setup(ide_drive_t *); extern void ide_dma_start(ide_drive_t *); extern int __ide_dma_end(ide_drive_t *); -extern void ide_dma_lost_irq(ide_drive_t *); -extern void ide_dma_timeout(ide_drive_t *); +extern int __ide_dma_lostirq(ide_drive_t *); +extern int __ide_dma_timeout(ide_drive_t *); #endif /* CONFIG_BLK_DEV_IDEDMA_PCI */ #else @@ -1384,11 +1382,11 @@ extern const ide_pio_timings_t ide_pio_timings[6]; extern spinlock_t ide_lock; -extern struct mutex ide_cfg_mtx; +extern struct semaphore ide_cfg_sem; /* * Structure locking: * - * ide_cfg_mtx and ide_lock together protect changes to + * ide_cfg_sem and ide_lock together protect changes to * ide_hwif_t->{next,hwgroup} * ide_drive_t->next * diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index cfb680585ab8..693f0e6c54d4 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -34,8 +34,6 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 -/* SCHED_ISO: reserved but not implemented yet */ -#define SCHED_IDLE 5 #ifdef __KERNEL__ @@ -132,26 +130,6 @@ extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); extern unsigned long weighted_cpuload(const int cpu); -struct seq_file; -struct cfs_rq; -#ifdef CONFIG_SCHED_DEBUG -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); -extern void proc_sched_set_task(struct task_struct *p); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); -#else -static inline void -proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ -} -static inline void proc_sched_set_task(struct task_struct *p) -{ -} -static inline void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) -{ -} -#endif /* * Task state bitmask. NOTE! These bits are also @@ -215,7 +193,6 @@ struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); extern void init_idle(struct task_struct *idle, int cpu); -extern void init_idle_bootup_task(struct task_struct *idle); extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) @@ -502,7 +479,7 @@ struct signal_struct { * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) 
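Stepping back to the eeprom_93cx6 interface introduced earlier in this patch (trunk/include/linux/eeprom_93cx6.h): a consumer fills in the two bit-bang callbacks and then calls eeprom_93cx6_read(). The sketch below is hypothetical — struct my_dev, the MY_* masks, and the single-CSR register layout are invented for illustration; only the eeprom_93cx6 fields, the PCI_EEPROM_WIDTH_93C46 constant, and the eeprom_93cx6_read() prototype come from the new header.

#include <linux/types.h>
#include <linux/io.h>
#include <linux/eeprom_93cx6.h>

#define MY_EEPROM_DATA_IN       0x01    /* hypothetical CSR bits */
#define MY_EEPROM_DATA_OUT      0x02
#define MY_EEPROM_DATA_CLOCK    0x04
#define MY_EEPROM_CHIP_SELECT   0x08

struct my_dev {
        void __iomem *csr;
};

static void my_eeprom_register_read(struct eeprom_93cx6 *eeprom)
{
        struct my_dev *dev = eeprom->data;
        u32 reg = readl(dev->csr);

        eeprom->reg_data_in = !!(reg & MY_EEPROM_DATA_IN);
        eeprom->reg_data_out = !!(reg & MY_EEPROM_DATA_OUT);
        eeprom->reg_data_clock = !!(reg & MY_EEPROM_DATA_CLOCK);
        eeprom->reg_chip_select = !!(reg & MY_EEPROM_CHIP_SELECT);
}

static void my_eeprom_register_write(struct eeprom_93cx6 *eeprom)
{
        struct my_dev *dev = eeprom->data;
        u32 reg = 0;

        if (eeprom->reg_data_in)
                reg |= MY_EEPROM_DATA_IN;
        if (eeprom->reg_data_out)
                reg |= MY_EEPROM_DATA_OUT;
        if (eeprom->reg_data_clock)
                reg |= MY_EEPROM_DATA_CLOCK;
        if (eeprom->reg_chip_select)
                reg |= MY_EEPROM_CHIP_SELECT;

        writel(reg, dev->csr);
}

static void my_read_eeprom(struct my_dev *dev, u16 *buf, u8 words)
{
        struct eeprom_93cx6 eeprom = {
                .data           = dev,
                .register_read  = my_eeprom_register_read,
                .register_write = my_eeprom_register_write,
                .width          = PCI_EEPROM_WIDTH_93C46,
        };
        u8 i;

        for (i = 0; i < words; i++)
                eeprom_93cx6_read(&eeprom, i, &buf[i]);
}

The helper clocks the read opcode out and the data back in by toggling reg_data_clock/reg_chip_select through register_write(), so the driver only has to expose its own register access.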
*/ - unsigned long long sum_sched_runtime; + unsigned long long sched_time; /* * We don't bother to synchronize most readers of this at all, @@ -544,6 +521,31 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) +#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) +#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) + /* * Some day this will be a full-fledged user tracking system.. */ @@ -581,13 +583,13 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long pcnt; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long cpu_time, /* time spent on the cpu */ + run_delay, /* time spent waiting on a runqueue */ + pcnt; /* # of timeslices run on this cpu */ /* timestamps */ - unsigned long long last_arrival,/* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ + unsigned long last_arrival, /* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -637,24 +639,18 @@ static inline int sched_info_on(void) #endif } -enum cpu_idle_type { - CPU_IDLE, - CPU_NOT_IDLE, - CPU_NEWLY_IDLE, - CPU_MAX_IDLE_TYPES +enum idle_type +{ + SCHED_IDLE, + NOT_IDLE, + NEWLY_IDLE, + MAX_IDLE_TYPES }; /* * sched-domains (multiprocessor balancing) declarations: */ - -/* - * Increase resolution of nice-level calculations: - */ -#define SCHED_LOAD_SHIFT 10 -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 5) +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ @@ -723,14 +719,14 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; - unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; - unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; - unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; + unsigned long lb_cnt[MAX_IDLE_TYPES]; + unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_balanced[MAX_IDLE_TYPES]; + unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_gained[MAX_IDLE_TYPES]; + unsigned long lb_hot_gained[MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[MAX_IDLE_TYPES]; /* Active load balancing */ unsigned long alb_cnt; @@ -757,6 +753,12 @@ struct sched_domain { extern int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2); +/* + * Maximum cache size the migration-costs auto-tuning code will + * search from: + */ +extern unsigned int max_cache_size; + #endif /* CONFIG_SMP */ @@ -807,86 +809,14 @@ struct mempolicy; struct pipe_inode_info; struct uts_namespace; -struct rq; -struct sched_domain; - -struct sched_class { - struct sched_class *next; - - void (*enqueue_task) (struct rq *rq, struct task_struct *p, - int wakeup, u64 now); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, - int sleep, u64 now); - void (*yield_task) (struct rq *rq, struct task_struct *p); - - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); - - struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); - void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); - - int (*load_balance) (struct rq *this_rq, int this_cpu, - struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved); - - void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p); - void (*task_new) (struct rq *rq, struct task_struct *p); +enum sleep_type { + SLEEP_NORMAL, + SLEEP_NONINTERACTIVE, + SLEEP_INTERACTIVE, + SLEEP_INTERRUPTED, }; -struct load_weight { - unsigned long weight, inv_weight; -}; - -/* - * CFS stats for a schedulable entity (task, task-group etc) - * - * Current field usage histogram: - * - * 4 se->block_start - * 4 se->run_node - * 4 se->sleep_start - * 4 se->sleep_start_fair - * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime - */ -struct sched_entity { - long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; - s64 fair_key; - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - unsigned int on_rq; - - u64 wait_start_fair; - u64 wait_start; - u64 exec_start; - u64 sleep_start; - u64 sleep_start_fair; - u64 block_start; - u64 sleep_max; - u64 block_max; - u64 exec_max; - u64 wait_max; - u64 last_ran; - - u64 sum_exec_runtime; - s64 sum_wait_runtime; - s64 sum_sleep_runtime; - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; -#ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity *parent; - /* rq on which this entity is (to be) queued: */ - struct cfs_rq *cfs_rq; - /* rq "owned" by this entity/group: */ - struct cfs_rq *my_q; -#endif -}; +struct prio_array; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 
stopped */ @@ -902,20 +832,23 @@ struct task_struct { int oncpu; #endif #endif - + int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; - struct sched_class *sched_class; - struct sched_entity se; + struct prio_array *array; unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif + unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ + enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice; + unsigned int time_slice, first_time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -1145,37 +1078,6 @@ struct task_struct { #endif }; -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO - -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) - -static inline int rt_prio(int prio) -{ - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; -} - -static inline int rt_task(struct task_struct *p) -{ - return rt_prio(p->prio); -} - static inline pid_t process_group(struct task_struct *tsk) { return tsk->signal->pgrp; @@ -1321,7 +1223,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); extern unsigned long long -task_sched_runtime(struct task_struct *task); +current_sched_time(const struct task_struct *current_task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -1330,8 +1232,6 @@ extern void sched_exec(void); #define sched_exec() {} #endif -extern void sched_clock_unstable_event(void); - #ifdef CONFIG_HOTPLUG_CPU extern void idle_task_exit(void); #else @@ -1340,14 +1240,6 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); -extern unsigned int sysctl_sched_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_batch_wakeup_granularity; -extern unsigned int sysctl_sched_stat_granularity; -extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_features; - #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); @@ -1425,8 +1317,8 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p, int clone_flags); -extern void sched_dead(struct task_struct *p); +extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); +extern void FASTCALL(sched_exit(struct task_struct * p)); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); @@ -1514,7 +1406,7 @@ extern struct mm_struct * mm_alloc(void); extern void FASTCALL(__mmdrop(struct mm_struct *)); static inline 
void mmdrop(struct mm_struct * mm) { - if (unlikely(atomic_dec_and_test(&mm->mm_count))) + if (atomic_dec_and_test(&mm->mm_count)) __mmdrop(mm); } @@ -1746,7 +1638,10 @@ static inline unsigned int task_cpu(const struct task_struct *p) return task_thread_info(p)->cpu; } -extern void set_task_cpu(struct task_struct *p, unsigned int cpu); +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + task_thread_info(p)->cpu = cpu; +} #else diff --git a/trunk/include/linux/topology.h b/trunk/include/linux/topology.h index da6c39b2d051..a9d1f049cc15 100644 --- a/trunk/include/linux/topology.h +++ b/trunk/include/linux/topology.h @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 0, \ + .newidle_idx = 1, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,15 +128,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ @@ -159,15 +158,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 0e686280450b..e820d00e1383 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -366,15 +366,15 @@ static inline void remove_wait_queue_locked(wait_queue_head_t *q, /* * These are the old interfaces to sleep waiting for an event. - * They are racy. DO NOT use them, use the wait_event* interfaces above. - * We plan to remove these interfaces. + * They are racy. DO NOT use them, use the wait_event* interfaces above. + * We plan to remove these interfaces during 2.7. */ -extern void sleep_on(wait_queue_head_t *q); -extern long sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); -extern void interruptible_sleep_on(wait_queue_head_t *q); -extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); +extern void FASTCALL(sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); /* * Waitqueues which are removed from the waitqueue_head at wakeup time diff --git a/trunk/init/main.c b/trunk/init/main.c index 0eb1c7463fe4..eb8bdbae4fc7 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -436,16 +436,15 @@ static void noinline __init_refok rest_init(void) /* * The boot idle thread must execute schedule() - * at least once to get things moving: + * at least one to get things moving: */ - init_idle_bootup_task(current); preempt_enable_no_resched(); schedule(); preempt_disable(); /* Call into cpu_idle with preempt disabled */ cpu_idle(); -} +} /* Check for early params. 
*/ static int __init do_early_param(char *param, char *val) diff --git a/trunk/kernel/delayacct.c b/trunk/kernel/delayacct.c index 81e697829633..c0148ae992c4 100644 --- a/trunk/kernel/delayacct.c +++ b/trunk/kernel/delayacct.c @@ -99,10 +99,9 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; struct timespec ts; + unsigned long t1,t2,t3; + unsigned long flags; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -125,10 +124,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_count += t1; - tmp = (s64)d->cpu_delay_total + t2; + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - tmp = (s64)d->cpu_run_virtual_total + t3; + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index ca6a11b73023..5c8ecbaa19a5 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -122,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk) sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; + sig->sched_time += tsk->sched_time; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -182,6 +182,7 @@ void release_task(struct task_struct * p) zap_leader = (leader->exit_signal == -1); } + sched_exit(p); write_unlock_irq(&tasklist_lock); proc_flush_task(p); release_thread(p); @@ -290,7 +291,7 @@ static void reparent_to_kthreadd(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if (task_nice(current) < 0) + if (!has_rt_policy(current) && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index da3a155bba0d..73ad5cda1bcd 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -877,7 +877,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->sum_sched_runtime = 0; + sig->sched_time = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); @@ -1040,7 +1040,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; - + p->sched_time = 0; #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ diff --git a/trunk/kernel/posix-cpu-timers.c b/trunk/kernel/posix-cpu-timers.c index b53c8fcd9d82..1de710e18373 100644 --- a/trunk/kernel/posix-cpu-timers.c +++ b/trunk/kernel/posix-cpu-timers.c @@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p) } static inline unsigned long long sched_ns(struct task_struct *p) { - return task_sched_runtime(p); + return (p == current) ? 
current_sched_time(p) : p->sched_time; } int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) @@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, } while (t != p); break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; + cpu->sched = p->signal->sched_time; /* Add in each other live thread. */ while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; + cpu->sched += t->sched_time; } cpu->sched += sched_ns(p); break; @@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer) */ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) + unsigned long long sched_time) { struct cpu_timer_list *timer, *next; cputime_t ptime = cputime_add(utime, stime); @@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { + if (timer->expires.sched < sched_time) { timer->expires.sched = 0; } else { - timer->expires.sched -= sum_exec_runtime; + timer->expires.sched -= sched_time; } } } @@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head, void posix_cpu_timers_exit(struct task_struct *tsk) { cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + tsk->utime, tsk->stime, tsk->sched_time); } void posix_cpu_timers_exit_group(struct task_struct *tsk) @@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) cleanup_timers(tsk->signal->cpu_timers, cputime_add(tsk->utime, tsk->signal->utime), cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); + tsk->sched_time + tsk->signal->sched_time); } @@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p, nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; + ns = t->sched_time + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { t->it_sched_expires = ns; @@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { + if (!--maxfire || tsk->sched_time < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk, int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; + unsigned long long sched_time, sched_expires; struct task_struct *t; struct list_head *timers = sig->cpu_timers; @@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk, */ utime = sig->utime; stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; + sched_time = sig->sched_time; t = tsk; do { utime = cputime_add(utime, t->utime); stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; + sched_time += t->sched_time; t = next_thread(t); } while (t != tsk); ptime = cputime_add(utime, stime); @@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < t->expires.sched) { + if 
(!--maxfire || sched_time < t->expires.sched) { sched_expires = t->expires.sched; break; } @@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk, virt_left = cputime_sub(virt_expires, utime); virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; + sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); sched_left = max_t(unsigned long long, sched_left, 1); } else { @@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk, t->it_virt_expires = ticks; } - sched = t->se.sum_exec_runtime + sched_left; + sched = t->sched_time + sched_left; if (sched_expires && (t->it_sched_expires == 0 || t->it_sched_expires > sched)) { t->it_sched_expires = sched; @@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (UNEXPIRED(prof) && UNEXPIRED(virt) && (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) + tsk->sched_time < tsk->it_sched_expires)) return; #undef UNEXPIRED diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 9fbced64bfee..50e1a3122699 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -16,19 +16,13 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri */ #include #include #include #include -#include +#include #include #include #include @@ -59,9 +53,9 @@ #include #include #include -#include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -97,9 +91,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - /* * These are the 'tuning knobs' of the scheduler: * @@ -109,6 +100,87 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... 
+5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) + +static unsigned int static_prio_timeslice(int static_prio) +{ + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, static_prio); +} #ifdef CONFIG_SMP /* @@ -131,87 +203,28 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) } #endif -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - /* - * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. 
*/ -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio == NICE_TO_PRIO(19)) - return 1; - - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - -static inline int rt_policy(int policy) -{ - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) - return 1; - return 0; -} -static inline int task_has_rt_policy(struct task_struct *p) +static inline unsigned int task_timeslice(struct task_struct *p) { - return rt_policy(p->policy); + return static_prio_timeslice(p->static_prio); } /* - * This is the priority-queue data structure of the RT scheduling class: + * These are the runqueue data structures: */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct load_stat { - struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; -}; - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running; - - s64 fair_clock; - u64 exec_clock; - s64 wait_runtime; - u64 sleeper_bonus; - unsigned long wait_runtime_overruns, wait_runtime_underruns; - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED - /* 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr; - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ -#endif -}; - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - int rt_load_balance_idx; - struct list_head *rt_load_balance_head, *rt_load_balance_curr; +struct prio_array { + unsigned int nr_active; + DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_PRIO]; }; /* @@ -222,28 +235,22 @@ struct rt_rq { * acquire operations must be ordered by ascending &runqueue. */ struct rq { - spinlock_t lock; /* runqueue lock */ + spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. 
*/ unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long raw_weighted_load; +#ifdef CONFIG_SMP + unsigned long cpu_load[3]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; -#ifdef CONFIG_FAIR_GROUP_SCHED - struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ #endif - struct rt_rq rt; + unsigned long long nr_switches; /* * This is part of a global counter where only the total sum @@ -253,18 +260,14 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; - - struct sched_class *load_balance_class; - + struct prio_array *active, *expired, arrays[2]; + int best_expired_prio; atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -304,11 +307,6 @@ struct rq { static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static DEFINE_MUTEX(sched_hotcpu_mutex); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -{ - rq->curr->sched_class->check_preempt_curr(rq, p); -} - static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -318,52 +316,6 @@ static inline int cpu_of(struct rq *rq) #endif } -/* - * Per-runqueue clock, as finegrained as the platform can give us: - */ -static unsigned long long __rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - if (unlikely(delta > 2*TICK_NSEC)) { - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; - - return clock; -} - -static inline unsigned long long rq_clock(struct rq *rq) -{ - int this_cpu = smp_processor_id(); - - if (this_cpu == cpu_of(rq)) - return __rq_clock(rq); - - return rq->clock; -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
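The prio_array restored above is what keeps next-task selection O(1): each priority level has its own list, and a 140-bit bitmap records which levels are non-empty, so picking the highest-priority runnable task is a sched_find_first_bit() over that bitmap (the bitops/sched.h hunk earlier in this patch). A minimal userspace model, with plain per-priority counters standing in for the task lists and a GCC/Clang __builtin_ctzll() standing in for __ffs():

#include <stdint.h>
#include <stdio.h>

#define MAX_PRIO        140

struct toy_prio_array {
        unsigned int nr_active;
        uint64_t bitmap[3];             /* 192 bits >= MAX_PRIO   */
        unsigned int queued[MAX_PRIO];  /* tasks at each priority */
};

static void toy_enqueue(struct toy_prio_array *a, int prio)
{
        a->queued[prio]++;
        a->bitmap[prio / 64] |= 1ULL << (prio % 64);
        a->nr_active++;
}

static void toy_dequeue(struct toy_prio_array *a, int prio)
{
        if (--a->queued[prio] == 0)     /* clear the bit only when the level empties */
                a->bitmap[prio / 64] &= ~(1ULL << (prio % 64));
        a->nr_active--;
}

/* rough equivalent of sched_find_first_bit() over the 140-bit bitmap */
static int toy_first_prio(const struct toy_prio_array *a)
{
        int i;

        for (i = 0; i < 3; i++)
                if (a->bitmap[i])
                        return __builtin_ctzll(a->bitmap[i]) + i * 64;
        return MAX_PRIO;                /* nothing runnable */
}

int main(void)
{
        struct toy_prio_array a = { 0 };

        toy_enqueue(&a, 120);                           /* a nice-0 task       */
        toy_enqueue(&a, 50);                            /* an RT task, prio 50 */
        printf("next prio: %d\n", toy_first_prio(&a));  /* 50  */
        toy_dequeue(&a, 50);
        printf("next prio: %d\n", toy_first_prio(&a));  /* 120 */
        return 0;
}

The active/expired pair in struct rq is then just two such arrays, swapped when the active one drains.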
@@ -379,18 +331,6 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -520,6 +460,134 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 14 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + preempt_disable(); + for_each_domain(cpu, sd) { + enum idle_type itype; + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", + sd->lb_cnt[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + preempt_enable(); +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} +# define schedstat_inc(rq, field) do { 
(rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +#endif + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -535,172 +603,177 @@ static inline struct rq *this_rq_lock(void) return rq; } +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. */ -void sched_clock_unstable_event(void) +static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + t->sched_info.last_queued = 0; } /* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) +static void sched_info_arrive(struct task_struct *t) { - int cpu; - - assert_spin_locked(&task_rq(p)->lock); - - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; - - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + unsigned long now = jiffies, delta_jiffies = 0; - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; + if (t->sched_info.last_queued) + delta_jiffies = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += delta_jiffies; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); + rq_sched_info_arrive(task_rq(t), delta_jiffies); } -static void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - if (!spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); -} -#else -static inline void resched_task(struct task_struct *p) +/* + * Called when a process is queued into either the active or expired + * array. 
The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct task_struct *t) { - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } -#endif -static u64 div64_likely32(u64 divident, unsigned long divisor) +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(struct task_struct *t) { -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - return divident; -#else - return divident / divisor; -#endif + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -static inline unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { - u64 tmp; - - if (unlikely(!lw->inv_weight)) - lw->inv_weight = WMULT_CONST / lw->weight; + struct rq *rq = task_rq(prev); - tmp = (u64)delta_exec * weight; /* - * Check whether we'd overflow the 64-bit multiplication: + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
*/ - if (unlikely(tmp > WMULT_CONST)) { - tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) - >> (WMULT_SHIFT/2); - } else { - tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; - } + if (prev != rq->idle) + sched_info_depart(prev); - return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); + if (next != rq->idle) + sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); } +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, struct prio_array *array) { - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static void enqueue_task(struct task_struct *p, struct prio_array *array) { - lw->weight += inc; - lw->inv_weight = 0; + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task(struct task_struct *p, struct prio_array *array) { - lw->weight -= dec; - lw->inv_weight = 0; + list_move_tail(&p->run_list, array->queue + p->prio); } -static void __update_curr_load(struct rq *rq, struct load_stat *ls) +static inline void +enqueue_task_head(struct task_struct *p, struct prio_array *array) { - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; } /* - * Update delta_exec, delta_fair fields for rq. + * __normal_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). + * We use 25% of the full 0...39 priority range so that: * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. * - * This function is called /before/ updating rq->ls.load - * and when switching tasks. + * Both properties are important to certain workloads. 
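+ *
+ * For example, taking the conventional nice-0 static_prio of 120, a fully
+ * interactive task gets the full +5 bonus and runs at prio 115, while a
+ * pure CPU hog gets the full -5 penalty and runs at 125; both stay well
+ * inside the [MAX_RT_PRIO, MAX_PRIO-1] clamp applied below.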
*/ -static void update_curr_load(struct rq *rq, u64 now) + +static inline int __normal_prio(struct task_struct *p) { - struct load_stat *ls = &rq->ls; - u64 start; + int bonus, prio; - start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. - */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; } /* @@ -718,146 +791,53 @@ static void update_curr_load(struct rq *rq, u64 now) * this code will need modification */ #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define load_weight(lp) \ +#define LOAD_WEIGHT(lp) \ (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) #define PRIO_TO_LOAD_WEIGHT(prio) \ - load_weight(static_prio_timeslice(prio)) + LOAD_WEIGHT(static_prio_timeslice(prio)) #define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) - -#define WEIGHT_IDLEPRIO 2 -#define WMULT_IDLEPRIO (1 << 31) - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. - */ -static const int prio_to_weight[40] = { -/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, -/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, -/* 0 */ NICE_0_LOAD /* 1024 */, -/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, -/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, -}; + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -static const u32 prio_to_wmult[40] = { - 48356, 60446, 75558, 94446, 118058, 147573, - 184467, 230589, 288233, 360285, 450347, - 562979, 703746, 879575, 1099582, 1374389, - 717986, 2147483, 2684354, 3355443, 4194304, - 244160, 6557201, 8196502, 10250518, 12782640, - 16025997, 19976592, 24970740, 31350126, 39045157, - 49367440, 61356675, 76695844, 95443717, 119304647, - 148102320, 186737708, 238609294, 286331153, -}; +static void set_load_weight(struct task_struct *p) +{ + if (has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. 
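+ * (With a zero load_weight the migration thread never contributes to
+ * rq->raw_weighted_load, so the balancer only sees the tasks it moves.)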
+ */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} static inline void -inc_load(struct rq *rq, const struct task_struct *p, u64 now) +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq, now); - update_load_add(&rq->ls.load, p->se.load.weight); + rq->raw_weighted_load += p->load_weight; } static inline void -dec_load(struct rq *rq, const struct task_struct *p, u64 now) +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq, now); - update_load_sub(&rq->ls.load, p->se.load.weight); + rq->raw_weighted_load -= p->load_weight; } -static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static inline void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; - inc_load(rq, p, now); + inc_raw_weighted_load(rq, p); } -static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static inline void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; - dec_load(rq, p, now); -} - -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator); - -#include "sched_stats.h" -#include "sched_rt.c" -#include "sched_fair.c" -#include "sched_idletask.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - -#define sched_class_highest (&rt_sched_class) - -static void set_load_weight(struct task_struct *p) -{ - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; - p->se.wait_runtime = 0; - - if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; - return; - } - - /* - * SCHED_IDLE tasks get minimal weight: - */ - if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; - return; - } - - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; -} - -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) -{ - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, now); - p->se.on_rq = 1; -} - -static void -dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - p->sched_class->dequeue_task(rq, p, sleep, now); - p->se.on_rq = 0; -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; + dec_raw_weighted_load(rq, p); } /* @@ -871,7 +851,7 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -899,47 +879,222 @@ static int effective_prio(struct task_struct *p) } /* - * 
activate_task - move a task to the runqueue. + * __activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +static void __activate_task(struct task_struct *p, struct rq *rq) { - u64 now = rq_clock(rq); + struct prio_array *target = rq->active; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; + if (batch_task(p)) + target = rq->expired; + enqueue_task(p, target); + inc_nr_running(p, rq); +} - enqueue_task(rq, p, wakeup, now); - inc_nr_running(p, rq, now); +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq->active); + inc_nr_running(p, rq); } /* - * activate_idle_task - move idle task to the _front_ of runqueue. + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: */ -static inline void activate_idle_task(struct task_struct *p, struct rq *rq) +static int recalc_task_prio(struct task_struct *p, unsigned long long now) { - u64 now = rq_clock(rq); + /* Caller must always ensure 'now >= p->timestamp' */ + unsigned long sleep_time = now - p->timestamp; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; + if (batch_task(p)) + sleep_time = 0; + + if (likely(sleep_time > 0)) { + /* + * This ceiling is set to the lowest priority that would allow + * a task to be reinserted into the active array on timeslice + * completion. + */ + unsigned long ceiling = INTERACTIVE_SLEEP(p); + + if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Prevents user tasks from achieving best priority + * with one single large enough sleep. + */ + p->sleep_avg = ceiling; + /* + * Using INTERACTIVE_SLEEP() as a ceiling places a + * nice(0) task 1ms sleep away from promotion, and + * gives it 700ms to round-robin with no chance of + * being demoted. This is more than generous, so + * mark this sleep as non-interactive to prevent the + * on-runqueue bonus logic from intervening should + * this task not receive cpu immediately. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; + } else { + /* + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O + */ + if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { + if (p->sleep_avg >= ceiling) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + ceiling) { + p->sleep_avg = ceiling; + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + } + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; + } - enqueue_task(rq, p, 0, now); - inc_nr_running(p, rq, now); + return effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) 
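+ *
+ * In short: RT tasks skip straight to __activate_task(); everything else
+ * first has 'now' compensated for cross-CPU sched_clock drift (for remote
+ * wakeups), gets a fresh priority from recalc_task_prio() and, if it was a
+ * normal sleeper, a sleep_type reclassification, and only then is enqueued.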
+ */ +static void activate_task(struct task_struct *p, struct rq *rq, int local) +{ + unsigned long long now; + + if (rt_task(p)) + goto out; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + struct rq *this_rq = this_rq(); + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + } +#endif + + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a + * milliseconds-range estimation of the amount of time that the task + * spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + if (p->state == TASK_UNINTERRUPTIBLE) + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), + (now - p->timestamp) >> 20); + } + + p->prio = recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (p->sleep_type == SLEEP_NORMAL) { + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->sleep_type = SLEEP_INTERRUPTED; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->sleep_type = SLEEP_INTERACTIVE; + } + } + p->timestamp = now; +out: + __activate_task(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +static void deactivate_task(struct task_struct *p, struct rq *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, p->array); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) { - u64 now = rq_clock(rq); + int cpu; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; + assert_spin_locked(&task_rq(p)->lock); - dequeue_task(rq, p, sleep, now); - dec_nr_running(p, rq, now); + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); } +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} +#else +static inline void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif + /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. 
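/*
 * A minimal user-space sketch of the nice-level -> load_weight mapping that
 * the LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() macros and raw_weighted_load in the
 * surrounding hunks rely on.  static_prio_timeslice() and the constants here
 * are assumptions modelled on the usual 2.6 O(1) scheduler defaults (HZ=1000),
 * not copied from this patch.
 */
#include <stdio.h>

#define MAX_RT_PRIO		100
#define MAX_PRIO		140
#define MAX_USER_PRIO		40
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define MIN_TIMESLICE		5	/* jiffies, assuming HZ=1000 */
#define DEF_TIMESLICE		100	/* jiffies, assuming HZ=1000 */
#define SCHED_LOAD_SCALE	128UL

/* Rough reimplementation of the kernel's static_prio_timeslice(). */
static unsigned int static_prio_timeslice(int static_prio)
{
	unsigned int base = (static_prio < NICE_TO_PRIO(0)) ?
				DEF_TIMESLICE * 4 : DEF_TIMESLICE;
	unsigned int slice = base * (MAX_PRIO - static_prio) /
				(MAX_USER_PRIO / 2);

	return slice > MIN_TIMESLICE ? slice : MIN_TIMESLICE;
}

/* Same formula as LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() in the patch. */
static unsigned long prio_to_load_weight(int static_prio)
{
	return static_prio_timeslice(static_prio) * SCHED_LOAD_SCALE /
		DEF_TIMESLICE;
}

int main(void)
{
	static const int nice_levels[] = { -20, -10, 0, 10, 19 };
	unsigned int i;

	/* Under these assumptions nice 0 works out to exactly SCHED_LOAD_SCALE. */
	for (i = 0; i < sizeof(nice_levels) / sizeof(nice_levels[0]); i++) {
		int prio = NICE_TO_PRIO(nice_levels[i]);

		printf("nice %3d -> timeslice %3u, load_weight %4lu\n",
		       nice_levels[i], static_prio_timeslice(prio),
		       prio_to_load_weight(prio));
	}
	return 0;
}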
@@ -952,42 +1107,10 @@ inline int task_curr(const struct task_struct *p) /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; -} - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_SMP - task_thread_info(p)->cpu = cpu; - set_task_cfs_rq(p); -#endif + return cpu_rq(cpu)->raw_weighted_load; } #ifdef CONFIG_SMP - -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ - int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - - new_rq->cfs.fair_clock; - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; - - __set_task_cpu(p, new_cpu); -} - struct migration_req { struct list_head list; @@ -1010,7 +1133,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->se.on_rq && !task_running(rq, p)) { + if (!p->array && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1035,8 +1158,9 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) void wait_task_inactive(struct task_struct *p) { unsigned long flags; - int running, on_rq; struct rq *rq; + struct prio_array *array; + int running; repeat: /* @@ -1068,7 +1192,7 @@ void wait_task_inactive(struct task_struct *p) */ rq = task_rq_lock(p, &flags); running = task_running(rq, p); - on_rq = p->se.on_rq; + array = p->array; task_rq_unlock(rq, &flags); /* @@ -1091,7 +1215,7 @@ void wait_task_inactive(struct task_struct *p) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(array)) { yield(); goto repeat; } @@ -1137,12 +1261,11 @@ void kick_process(struct task_struct *p) static inline unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); if (type == 0) - return total; + return rq->raw_weighted_load; - return min(rq->cpu_load[type-1], total); + return min(rq->cpu_load[type-1], rq->raw_weighted_load); } /* @@ -1152,12 +1275,11 @@ static inline unsigned long source_load(int cpu, int type) static inline unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); if (type == 0) - return total; + return rq->raw_weighted_load; - return max(rq->cpu_load[type-1], total); + return max(rq->cpu_load[type-1], rq->raw_weighted_load); } /* @@ -1166,10 +1288,9 @@ static inline unsigned long target_load(int cpu, int type) static inline unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); unsigned long n = rq->nr_running; - return n ? total / n : SCHED_LOAD_SCALE; + return n ? 
rq->raw_weighted_load / n : SCHED_LOAD_SCALE; } /* @@ -1271,9 +1392,9 @@ static int sched_balance_self(int cpu, int flag) struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ + /* + * If power savings logic is enabled for a domain, stop there. + */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; if (tmp->flags & flag) @@ -1356,9 +1477,9 @@ static int wake_idle(int cpu, struct task_struct *p) if (idle_cpu(i)) return i; } - } else { - break; } + else + break; } return cpu; } @@ -1400,7 +1521,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!(old_state & state)) goto out; - if (p->se.on_rq) + if (p->array) goto out_running; cpu = task_cpu(p); @@ -1455,11 +1576,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * of the current CPU: */ if (sync) - tl -= current->se.load.weight; + tl -= current->load_weight; if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { + 100*(tl + p->load_weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1493,7 +1614,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) old_state = p->state; if (!(old_state & state)) goto out; - if (p->se.on_rq) + if (p->array) goto out_running; this_cpu = smp_processor_id(); @@ -1502,7 +1623,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - activate_task(rq, p, 1); + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; + } else + + /* + * Tasks that have marked their sleep as noninteractive get + * woken up with their sleep average not weighted in an + * interactive way. + */ + if (old_state & TASK_NONINTERACTIVE) + p->sleep_type = SLEEP_NONINTERACTIVE; + + + activate_task(p, rq, cpu == this_cpu); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1511,8 +1650,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - if (!sync || cpu != this_cpu) - check_preempt_curr(rq, p); + if (!sync || cpu != this_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } success = 1; out_running: @@ -1535,36 +1676,19 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
- * - * __sched_fork() is basic setup used by init_idle() too: - */ -static void __sched_fork(struct task_struct *p) -{ - p->se.wait_start_fair = 0; - p->se.wait_start = 0; - p->se.exec_start = 0; - p->se.sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; - p->se.wait_runtime = 0; - p->se.sum_wait_runtime = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; + */ +void fastcall sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); - INIT_LIST_HEAD(&p->run_list); - p->se.on_rq = 0; +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); /* * We mark the process as running here, but have not actually @@ -1573,29 +1697,16 @@ static void __sched_fork(struct task_struct *p) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; -} - -/* - * fork()/clone()-time setup: - */ -void sched_fork(struct task_struct *p, int clone_flags) -{ - int cpu = get_cpu(); - - __sched_fork(p); - -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - __set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); + p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (likely(sched_info_on())) + if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -1605,15 +1716,33 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + task_running_tick(cpu_rq(cpu), current); + } + local_irq_enable(); put_cpu(); } -/* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - /* * wake_up_new_task - wake up a newly created task for the first time. 
* @@ -1623,27 +1752,107 @@ unsigned int __read_mostly sysctl_sched_child_runs_first = 1; */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { + struct rq *rq, *this_rq; unsigned long flags; - struct rq *rq; - int this_cpu; + int this_cpu, cpu; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); /* parent's CPU */ + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); p->prio = effective_prio(p); - if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || - task_cpu(p) != this_cpu || !current->se.on_rq) { - activate_task(rq, p, 0); + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + p->normal_prio = current->normal_prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + inc_nr_running(p, rq); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + /* - * Let the scheduling class do new task startup - * management (if any): + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: */ - p->sched_class->task_new(rq, p); + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void fastcall sched_exit(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. 
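+ * (The remainder of the child's first timeslice, if it still holds one,
+ * is handed back to the parent below, capped at a full task_timeslice();
+ * as noted above, this cannot manufacture extra CPU time.)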
+ */ + rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); } - check_preempt_curr(rq, p); + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -1708,7 +1917,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) /* * Remove function-return probe instances associated with this * task and put them back on the free list. - */ + */ kprobe_flush_task(prev); put_task_struct(prev); } @@ -1736,15 +1945,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +static inline struct task_struct * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - struct mm_struct *mm, *oldmm; + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; - prepare_task_switch(rq, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -1752,15 +1959,16 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_enter_lazy_cpu_mode(); - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; + WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } /* @@ -1776,13 +1984,7 @@ context_switch(struct rq *rq, struct task_struct *prev, /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + return prev; } /* @@ -1855,65 +2057,17 @@ unsigned long nr_active(void) return running + uninterruptible; } +#ifdef CONFIG_SMP + /* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). 
+ * Is this task likely cache-hot: */ -static void update_cpu_load(struct rq *this_rq) +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; - u64 now = __rq_clock(this_rq); - int i, scale; - - this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq, now); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = now - ls->load_update_last; - ls->load_update_last = now; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: - - /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } + return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; } -#ifdef CONFIG_SMP - /* * double_rq_lock - safely lock two runqueues * @@ -2030,17 +2184,23 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void pull_task(struct rq *src_rq, struct prio_array *src_array, + struct task_struct *p, struct rq *this_rq, + struct prio_array *this_array, int this_cpu) { - deactivate_task(src_rq, p, 0); + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); } /* @@ -2048,7 +2208,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* @@ -2065,67 +2225,132 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 0; /* - * Aggressive migration if too many balance attempts have failed: + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. 
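+ * "Cache cold" here means task_hot() above is false, i.e. the task last
+ * ran longer than sd->cache_hot_time before rq->most_recent_timestamp.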
*/ - if (sd->nr_balance_failed > sd->cache_nice_tries) + + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } + if (task_hot(p, rq->most_recent_timestamp, sd)) + return 0; return 1; } -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) + +/* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted + * load from busiest to this_rq, as part of a balancing operation within + * "domain". Returns the number of tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { - int pulled = 0, pinned = 0, skip_for_load; - struct task_struct *p; - long rem_load_move = max_load_move; + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; + struct prio_array *array, *dst_array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; if (max_nr_move == 0 || max_load_move == 0) goto out; + rem_load_move = max_load_move; pinned = 1; + this_best_prio = rq_best_prio(this_rq); + best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + best_prio_seen = best_prio == busiest->curr->prio; /* - * Start the load-balancing iterator: + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. */ - p = iterator->start(iterator->arg); -next: - if (!p) + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + /* * To help distribute high priority tasks accross CPUs we don't * skip a task if it will be the highest priority task (i.e. 
smallest * prio value) on its new queue regardless of its load weight */ - skip_for_load = (p->se.load.weight >> 1) > rem_load_move + - SCHED_LOAD_SCALE_FUZZ; - if (skip_for_load && p->prio < this_best_prio) - skip_for_load = !best_prio_seen && p->prio == best_prio; + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !best_prio_seen && idx == best_prio; if (skip_for_load || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - best_prio_seen |= p->prio == best_prio; - p = iterator->next(iterator->arg); - goto next; + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - pull_task(busiest, p, this_rq, this_cpu); + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; - rem_load_move -= p->se.load.weight; + rem_load_move -= tmp->load_weight; /* * We only want to steal up to the prescribed number of tasks * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (p->prio < this_best_prio) - this_best_prio = p->prio; - p = iterator->next(iterator->arg); - goto next; + if (idx < this_best_prio) + this_best_prio = idx; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } out: /* @@ -2137,39 +2362,9 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, if (all_pinned) *all_pinned = pinned; - *load_moved = max_load_move - rem_load_move; return pulled; } -/* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - struct sched_class *class = sched_class_highest; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; - - do { - nr_moved = class->load_balance(this_rq, this_cpu, busiest, - max_nr_move, (unsigned long)rem_load_move, - sd, idle, all_pinned, &load_moved); - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; - class = class->next; - } while (class && max_nr_move && rem_load_move > 0); - - return total_nr_moved; -} - /* * find_busiest_group finds and returns the busiest CPU group within the * domain. 
It calculates and returns the amount of weighted load which @@ -2177,8 +2372,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + unsigned long *imbalance, enum idle_type idle, int *sd_idle, + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2196,9 +2391,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == CPU_NOT_IDLE) + if (idle == NOT_IDLE) load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) + else if (idle == NEWLY_IDLE) load_idx = sd->newidle_idx; else load_idx = sd->idle_idx; @@ -2242,7 +2437,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); + sum_weighted_load += rq->raw_weighted_load; } /* @@ -2282,9 +2477,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Busy processors will not participate in power savings * balance. */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; /* * If the local group is idle or completely loaded @@ -2294,42 +2488,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, !this_nr_running)) power_savings_balance = 0; - /* + /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity + */ + if (!power_savings_balance || sum_nr_running >= group_capacity || !sum_nr_running) - goto group_next; + goto group_next; - /* + /* * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && first_cpu(group->cpumask) < first_cpu(group_min->cpumask))) { - group_min = group; - min_nr_running = sum_nr_running; + group_min = group; + min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / sum_nr_running; - } + } - /* + /* * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } } group_next: #endif @@ -2384,7 +2578,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2398,8 +2592,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } else this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= - busiest_load_per_task * imbn) { + if (max_load - this_load >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; } @@ -2446,7 +2639,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; if (this == group_leader && group_leader != group_min) { @@ -2463,7 +2656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, +find_busiest_queue(struct sched_group *group, enum idle_type idle, unsigned long imbalance, cpumask_t *cpus) { struct rq *busiest = NULL, *rq; @@ -2471,19 +2664,17 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu_mask(i, group->cpumask) { - unsigned long wl; if (!cpu_isset(i, *cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); - if (rq->nr_running == 1 && wl > imbalance) + if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) continue; - if (wl > max_load) { - max_load = wl; + if (rq->raw_weighted_load > max_load) { + max_load = rq->raw_weighted_load; busiest = rq; } } @@ -2507,7 +2698,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n) * tasks if there is an imbalance. 
*/ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, + struct sched_domain *sd, enum idle_type idle, int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; @@ -2520,10 +2711,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; @@ -2657,7 +2848,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). * this_rq is locked. */ static int @@ -2674,31 +2865,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as CPU_NOT_IDLE. + * portraying it as NOT_IDLE. */ if (sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle, &cpus, NULL); if (!group) { - schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, &cpus); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; } BUG_ON(busiest == this_rq); - schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = 0; if (busiest->nr_running > 1) { @@ -2706,7 +2897,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); if (!nr_moved) { @@ -2717,7 +2908,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) } if (!nr_moved) { - schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2727,7 +2918,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) return nr_moved; out_balanced: - schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ 
-2743,8 +2934,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; - unsigned long next_balance = jiffies + HZ; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2763,13 +2954,12 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (pulled_task) break; } - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + if (!pulled_task) /* * We are going idle. next_balance may be set based on * a busy processor. So reset next_balance. */ this_rq->next_balance = next_balance; - } } /* @@ -2813,7 +3003,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_cnt); if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else @@ -2822,6 +3012,32 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } +static void update_load(struct rq *this_rq) +{ + unsigned long this_load; + unsigned int i, scale; + + this_load = this_rq->raw_weighted_load; + + /* Update our load: */ + for (i = 0, scale = 1; i < 3; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } +} + #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; @@ -2904,7 +3120,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. */ -static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) +static inline void rebalance_domains(int cpu, enum idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -2918,16 +3134,13 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) continue; interval = sd->balance_interval; - if (idle != CPU_IDLE) + if (idle != SCHED_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; - if (sd->flags & SD_SERIALIZE) { if (!spin_trylock(&balancing)) @@ -2941,7 +3154,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) * longer idle, or one of our SMT siblings is * not idle. */ - idle = CPU_NOT_IDLE; + idle = NOT_IDLE; } sd->last_balance = jiffies; } @@ -2969,12 +3182,11 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? - CPU_IDLE : CPU_NOT_IDLE; + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum idle_type idle = local_rq->idle_at_tick ? 
SCHED_IDLE : NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(local_cpu, idle); #ifdef CONFIG_NO_HZ /* @@ -2982,13 +3194,13 @@ static void run_rebalance_domains(struct softirq_action *h) * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); + cpu_clear(local_cpu, cpus); for_each_cpu_mask(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing @@ -3001,8 +3213,8 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(balance_cpu, SCHED_IDLE); rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; } } #endif @@ -3015,8 +3227,9 @@ static void run_rebalance_domains(struct softirq_action *h) * idle load balancing owner or decide to stop the periodic load balancing, * if the whole system is idle. */ -static inline void trigger_load_balance(struct rq *rq, int cpu) +static inline void trigger_load_balance(int cpu) { + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_NO_HZ /* * If we were in the nohz mode recently and busy at the current @@ -3068,29 +3281,13 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); } - -#else /* CONFIG_SMP */ - +#else /* * on UP we do not need to balance between CPUs: */ static inline void idle_balance(int cpu, struct rq *rq) { } - -/* Avoid "used but not defined" warning on UP */ -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) -{ - *load_moved = 0; - - return 0; -} - #endif DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -3098,27 +3295,53 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) +{ + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; +} + +/* + * Return current->sched_time plus any more ns on the sched_clock + * that have not yet been banked. */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long current_sched_time(const struct task_struct *p) { + unsigned long long ns; unsigned long flags; - u64 ns, delta_exec; - struct rq *rq; - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; - if (rq->curr == p) { - delta_exec = rq_clock(rq) - p->se.exec_start; - if ((s64)delta_exec > 0) - ns += delta_exec; - } - task_rq_unlock(rq, &flags); + local_irq_save(flags); + ns = p->sched_time + sched_clock() - p->last_ran; + local_irq_restore(flags); return ns; } +/* + * We place interactive tasks back into the active array, if possible. 
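/*
 * Illustrative aside (user-space model, not from this patch): update_load(),
 * restored a couple of hunks above, keeps three exponentially decaying load
 * averages in cpu_load[0..2], where level i weights history by (2^i - 1)/2^i
 * and rounds up when the load is rising.  Only that arithmetic is mirrored
 * below; the sample load sequence is invented.
 */
#include <stdio.h>

#define NR_LOAD_IDX 3

static void model_update_load(unsigned long cpu_load[NR_LOAD_IDX],
			      unsigned long this_load)
{
	unsigned int i, scale;

	for (i = 0, scale = 1; i < NR_LOAD_IDX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		/* round up so a rising load is not stuck just below target */
		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long cpu_load[NR_LOAD_IDX] = { 0, 0, 0 };
	unsigned long samples[] = { 10, 10, 10, 0, 0, 10 };
	unsigned int t;

	for (t = 0; t < sizeof(samples) / sizeof(samples[0]); t++) {
		model_update_load(cpu_load, samples[t]);
		printf("tick %u: load=%2lu -> cpu_load = { %lu, %lu, %lu }\n",
		       t, samples[t], cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}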
+ * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +static inline int expired_starving(struct rq *rq) +{ + if (rq->curr->static_prio > rq->best_expired_prio) + return 1; + if (!STARVATION_LIMIT || !rq->expired_timestamp) + return 0; + if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) + return 1; + return 0; +} + /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3192,6 +3415,81 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } +static void task_running_tick(struct rq *rq, struct task_struct *p) +{ + if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->active); + } + goto out_unlock; + } + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + requeue_task(p, rq->active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
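/*
 * Illustrative aside (user-space model, not from this patch): the restored
 * task_running_tick() stops honouring interactivity once the first expired
 * task has waited "too long", as decided by expired_starving() above.  The
 * model below mirrors that check with plain integers; STARVATION_TICKS and
 * the sample runqueue numbers are invented stand-ins for the kernel's
 * STARVATION_LIMIT and jiffies bookkeeping.
 */
#include <stdio.h>

#define STARVATION_TICKS 100	/* invented stand-in for STARVATION_LIMIT */

struct rq_model {
	unsigned long now;		 /* "jiffies" */
	unsigned long expired_timestamp; /* 0 = expired array is empty */
	unsigned long nr_running;
	int curr_static_prio;
	int best_expired_prio;
};

static int expired_starving_model(const struct rq_model *rq)
{
	/* a better (lower) static priority already expired: switch soon */
	if (rq->curr_static_prio > rq->best_expired_prio)
		return 1;
	if (!rq->expired_timestamp)
		return 0;
	/* the deadline scales with the number of runnable tasks */
	return rq->now - rq->expired_timestamp >
	       STARVATION_TICKS * rq->nr_running;
}

int main(void)
{
	struct rq_model rq = {
		.now = 1000, .expired_timestamp = 500,
		.nr_running = 4, .curr_static_prio = 120,
		.best_expired_prio = 125,
	};

	printf("starving: %d\n", expired_starving_model(&rq)); /* 500 > 400 -> 1 */
	return 0;
}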
@@ -3201,19 +3499,20 @@ void account_steal_time(struct task_struct *p, cputime_t steal) */ void scheduler_tick(void) { + unsigned long long now = sched_clock(); + struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; - spin_lock(&rq->lock); - if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); - update_cpu_load(rq); - spin_unlock(&rq->lock); + update_cpu_clock(p, rq, now); + if (!idle_at_tick) + task_running_tick(rq, p); #ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + update_load(rq); + rq->idle_at_tick = idle_at_tick; + trigger_load_balance(cpu); #endif } @@ -3255,129 +3554,170 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) +static inline int interactive_sleep(enum sleep_type sleep_type) { - printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", - prev->comm, preempt_count(), prev->pid); - debug_show_held_locks(prev); - if (irqs_disabled()) - print_irqtrace_events(prev); - dump_stack(); + return (sleep_type == SLEEP_INTERACTIVE || + sleep_type == SLEEP_INTERRUPTED); } /* - * Various schedule()-time debugging checks and statistics: + * schedule() is the main scheduler function. */ -static inline void schedule_debug(struct task_struct *prev) +asmlinkage void __sched schedule(void) { + struct task_struct *prev, *next; + struct prio_array *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) - __schedule_bug(prev); - + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); + } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); -} +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); +need_resched_nonpreemptible: + rq = this_rq(); -/* - * Pick up the highest-prio task: - */ -static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) -{ - struct sched_class *class; - struct task_struct *p; + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. 
+ */ + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)(now - prev->timestamp) < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq, now); - if (likely(p)) - return p; + run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } } - class = sched_class_highest; - for ( ; ; ) { - p = class->pick_next_task(rq, now); - if (p) - return p; + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + goto switch_tasks; + } + } + + array = rq->active; + if (unlikely(!array->nr_active)) { /* - * Will never be NULL as the idle class always - * returns a non-NULL p: + * Switch the active and expired arrays. */ - class = class->next; + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; } -} - -/* - * schedule() is the main scheduler function. 
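/*
 * Illustrative aside (user-space model, not from this patch): the schedule()
 * body restored above is the classic O(1) design - pick the lowest-numbered
 * priority with queued tasks in the active array, and when the active array
 * drains, swap the active and expired array pointers.  The real code uses a
 * priority bitmap and sched_find_first_bit(); the model below keeps only a
 * per-priority task count and scans linearly to stay short.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO 140

struct prio_array_model {
	unsigned int nr_active;
	unsigned int nr_tasks[MAX_PRIO];	/* stand-in for the per-prio run lists */
};

static int lowest_prio(const struct prio_array_model *a)
{
	int prio;

	for (prio = 0; prio < MAX_PRIO; prio++)
		if (a->nr_tasks[prio])
			return prio;
	return MAX_PRIO;			/* empty */
}

static void enqueue(struct prio_array_model *a, int prio)
{
	a->nr_tasks[prio]++;
	a->nr_active++;
}

static int pick_next(struct prio_array_model **active,
		     struct prio_array_model **expired)
{
	int prio;

	if (!(*active)->nr_active) {		/* active drained: switch arrays */
		struct prio_array_model *tmp = *active;
		*active = *expired;
		*expired = tmp;
	}
	prio = lowest_prio(*active);
	if (prio == MAX_PRIO)
		return -1;			/* nothing runnable anywhere: go idle */
	(*active)->nr_tasks[prio]--;
	(*active)->nr_active--;
	return prio;
}

int main(void)
{
	struct prio_array_model arrays[2];
	struct prio_array_model *active = &arrays[0], *expired = &arrays[1];

	memset(arrays, 0, sizeof(arrays));
	enqueue(active, 120);	/* nice-0 task */
	enqueue(active, 100);	/* nice -20 task */
	enqueue(expired, 130);	/* already-expired, +10 niced task */

	printf("picked prio %d\n", pick_next(&active, &expired)); /* 100 */
	printf("picked prio %d\n", pick_next(&active, &expired)); /* 120 */
	printf("picked prio %d\n", pick_next(&active, &expired)); /* 130, after switch */
	return 0;
}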
- */ -asmlinkage void __sched schedule(void) -{ - struct task_struct *prev, *next; - long *switch_count; - struct rq *rq; - u64 now; - int cpu; -need_resched: - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); - prev = rq->curr; - switch_count = &prev->nivcsw; + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); - release_kernel_lock(prev); -need_resched_nonpreemptible: + if (!rt_task(next) && interactive_sleep(next->sleep_type)) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)(now - next->timestamp) < 0)) + delta = 0; - schedule_debug(prev); + if (next->sleep_type == SLEEP_INTERACTIVE) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); + array = next->array; + new_prio = recalc_task_prio(next, next->timestamp + delta); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { - prev->state = TASK_RUNNING; - } else { - deactivate_task(rq, prev, 1); + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); } - switch_count = &prev->nvcsw; } + next->sleep_type = SLEEP_NORMAL; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + update_cpu_clock(prev, rq, now); - now = __rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev, now); - next = pick_next_task(rq, prev, now); + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0) + prev->sleep_avg = 0; + prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); - if (likely(prev != next)) { + next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. 
+ */ + finish_task_switch(this_rq(), prev); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -3705,85 +4045,74 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); -static inline void -sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) -{ - spin_lock_irqsave(&q->lock, *flags); - __add_wait_queue(q, wait); + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ spin_unlock(&q->lock); -} -static inline void -sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) -{ - spin_lock_irq(&q->lock); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, *flags); -} +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); -void __sched interruptible_sleep_on(wait_queue_head_t *q) +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD schedule(); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL } EXPORT_SYMBOL(interruptible_sleep_on); -long __sched +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL return timeout; } EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void __sched sleep_on(wait_queue_head_t *q) +void fastcall __sched sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD schedule(); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL } EXPORT_SYMBOL(sleep_on); -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL return timeout; } + EXPORT_SYMBOL(sleep_on_timeout); #ifdef CONFIG_RT_MUTEXES @@ -3800,30 +4129,29 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { + struct prio_array *array; unsigned long flags; - int oldprio, on_rq; struct rq *rq; - u64 now; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - now = rq_clock(rq); oldprio = p->prio; - on_rq = p->se.on_rq; - if (on_rq) - dequeue_task(rq, p, 0, now); - - if (rt_prio(prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - + array = 
p->array; + if (array) + dequeue_task(p, array); p->prio = prio; - if (on_rq) { - enqueue_task(rq, p, 0, now); + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3832,9 +4160,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } task_rq_unlock(rq, &flags); } @@ -3843,10 +4170,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + struct prio_array *array; + int old_prio, delta; unsigned long flags; struct rq *rq; - u64 now; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3855,21 +4182,20 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); - now = rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * not SCHED_NORMAL/SCHED_BATCH: */ - if (task_has_rt_policy(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; - if (on_rq) { - dequeue_task(rq, p, 0, now); - dec_load(rq, p, now); + array = p->array; + if (array) { + dequeue_task(p, array); + dec_raw_weighted_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); @@ -3878,9 +4204,9 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { - enqueue_task(rq, p, 0, now); - inc_load(rq, p, now); + if (array) { + enqueue_task(p, array); + inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4000,28 +4326,20 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) } /* Actually do priority change: must hold rq lock. 
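/*
 * Illustrative aside (user-space model, not from this patch): rt_mutex_setprio()
 * and set_user_nice() above dequeue a runnable task, recompute its priority,
 * re-enqueue it, and reschedule only when that matters.  The model below shows
 * the nice -> static priority mapping and the resched decision; NICE_TO_PRIO
 * follows the usual 2.6-era definition (MAX_RT_PRIO + nice + 20), quoted from
 * memory rather than from this hunk, and the sample priorities are invented.
 */
#include <stdio.h>

#define MAX_RT_PRIO		100
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

static int needs_resched(int running, int old_prio, int new_prio, int curr_prio)
{
	/* lower numbers mean higher priority */
	if (running)
		return new_prio > old_prio;	/* got worse while on the CPU */
	return new_prio < curr_prio;		/* would now preempt the running task */
}

int main(void)
{
	int old_prio = NICE_TO_PRIO(0);		/* 120 */
	int new_prio = NICE_TO_PRIO(10);	/* 130 after "renice +10" */

	printf("old=%d new=%d -> resched while running: %d\n",
	       old_prio, new_prio, needs_resched(1, old_prio, new_prio, 120));
	return 0;
}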
*/ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); + BUG_ON(p->array); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; set_load_weight(p); } @@ -4036,7 +4354,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1; + struct prio_array *array; unsigned long flags; struct rq *rq; @@ -4047,27 +4366,27 @@ int sched_setscheduler(struct task_struct *p, int policy, if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + policy != SCHED_NORMAL && policy != SCHED_BATCH) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0. */ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - if (rt_policy(policy)) { + if (is_rt_policy(policy)) { unsigned long rlim_rtprio; + unsigned long flags; if (!lock_task_sighand(p, &flags)) return -ESRCH; @@ -4083,12 +4402,6 @@ int sched_setscheduler(struct task_struct *p, int policy, param->sched_priority > rlim_rtprio) return -EPERM; } - /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: - */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && @@ -4116,13 +4429,13 @@ int sched_setscheduler(struct task_struct *p, int policy, spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); + array = p->array; + if (array) + deactivate_task(p, rq); oldprio = p->prio; - __setscheduler(rq, p, policy, param->sched_priority); - if (on_rq) { - activate_task(rq, p, 0); + __setscheduler(p, policy, param->sched_priority); + if (array) { + __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4131,9 +4444,8 @@ int sched_setscheduler(struct task_struct *p, int policy, if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4405,18 +4717,41 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int 
len, /** * sys_sched_yield - yield the current processor to other threads. * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. + * This function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); + struct prio_array *array = current->array, *target = rq->expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) + */ + if (rt_task(current)) + target = rq->active; - schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) + if (array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); /* * Since we are going to call schedule() anyway, there's @@ -4567,7 +4902,6 @@ asmlinkage long sys_sched_get_priority_max(int policy) break; case SCHED_NORMAL: case SCHED_BATCH: - case SCHED_IDLE: ret = 0; break; } @@ -4592,7 +4926,6 @@ asmlinkage long sys_sched_get_priority_min(int policy) break; case SCHED_NORMAL: case SCHED_BATCH: - case SCHED_IDLE: ret = 0; } return ret; @@ -4627,7 +4960,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) goto out_unlock; jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : static_prio_timeslice(p->static_prio), &t); + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; out_nounlock: @@ -4702,9 +5035,6 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); -#endif read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -4713,11 +5043,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -4731,12 +5056,13 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(idle); - idle->se.exec_start = sched_clock(); - + idle->timestamp = sched_clock(); + idle->sleep_avg = 0; + idle->array = NULL; idle->prio = idle->normal_prio = MAX_PRIO; + idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); - __set_task_cpu(idle, cpu); + set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; @@ -4751,10 +5077,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #else task_thread_info(idle)->preempt_count = 0; #endif - /* - * The idle tasks have their own, simple scheduling class: - */ - idle->sched_class = &idle_sched_class; } /* @@ -4766,28 +5088,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 10000000; - - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; - - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -4861,7 +5161,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) return ret; @@ -4877,13 +5177,20 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - if (on_rq) { - activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); } ret = 1; out: @@ -5035,8 +5342,7 @@ static void migrate_live_tasks(int src_cpu) write_unlock_irq(&tasklist_lock); } -/* - * Schedules idle task to be the next runnable task on current CPU. 
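/*
 * Illustrative aside (user-space model, not from this patch): __migrate_task()
 * above rebases a migrating task's timestamp from the source runqueue's clock
 * to the destination's, so its sleep/run history stays meaningful on the new
 * CPU.  The numbers below are invented; only the rebasing arithmetic mirrors
 * the patch.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long src_most_recent = 5000;	/* rq_src clock, ns */
	unsigned long long dst_most_recent = 9000;	/* rq_dest clock, ns */
	unsigned long long task_timestamp  = 4200;	/* stamped on rq_src */

	/* same offset from "now", expressed in the destination's clock */
	task_timestamp = task_timestamp - src_most_recent + dst_most_recent;

	printf("timestamp on rq_dest: %llu (800 ns before its clock)\n",
	       task_timestamp);
	return 0;
}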
+/* Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. */ @@ -5056,10 +5362,10 @@ void sched_idle_next(void) */ spin_lock_irqsave(&rq->lock, flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to the _front_ of its priority queue: */ - activate_idle_task(p, rq); + __activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -5109,15 +5415,16 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) static void migrate_dead_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + unsigned int arr, i; - for ( ; ; ) { - if (!rq->nr_running) - break; - next = pick_next_task(rq, rq->curr, rq_clock(rq)); - if (!next) - break; - migrate_dead(dead_cpu, next); + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + + while (!list_empty(list)) + migrate_dead(dead_cpu, list_entry(list->next, + struct task_struct, run_list)); + } } } #endif /* CONFIG_HOTPLUG_CPU */ @@ -5141,14 +5448,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; @@ -5179,10 +5486,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq, rq->idle, 0); + deactivate_task(rq->idle, rq); rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; + __setscheduler(rq->idle, SCHED_NORMAL, 0); migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); migrate_nr_uninterruptible(rq); @@ -5491,6 +5797,483 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, #define SD_NODES_PER_DOMAIN 16 +/* + * Self-tuning task migration cost measurement between source and target CPUs. + * + * This is done by measuring the cost of manipulating buffers of varying + * sizes. For a given buffer-size here are the steps that are taken: + * + * 1) the source CPU reads+dirties a shared buffer + * 2) the target CPU reads+dirties the same shared buffer + * + * We measure how long they take, in the following 4 scenarios: + * + * - source: CPU1, target: CPU2 | cost1 + * - source: CPU2, target: CPU1 | cost2 + * - source: CPU1, target: CPU1 | cost3 + * - source: CPU2, target: CPU2 | cost4 + * + * We then calculate the cost3+cost4-cost1-cost2 difference - this is + * the cost of migration. + * + * We then start off from a small buffer-size and iterate up to larger + * buffer sizes, in 5% steps - measuring each buffer-size separately, and + * doing a maximum search for the cost. (The maximum cost for a migration + * normally occurs when the working set size is around the effective cache + * size.) 
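/*
 * Illustrative aside (not from this patch): the measurement described in the
 * comment above boils down to taking the two cross-CPU passes minus the two
 * same-CPU passes over the same buffer - which is what measure_cost() further
 * down computes by subtracting the cached cost from the cross-CPU cost.  The
 * helper below just evaluates that difference; the four timings are invented.
 */
#include <stdio.h>

static long long migration_cost_ns(long long cross1, long long cross2,
				   long long local1, long long local2)
{
	/* cross-CPU passes are slower; the surplus is the migration cost */
	return (cross1 + cross2) - (local1 + local2);
}

int main(void)
{
	/* invented timings, ns: CPU1->CPU2, CPU2->CPU1, CPU1->CPU1, CPU2->CPU2 */
	printf("migration cost: %lld ns\n",
	       migration_cost_ns(180000, 175000, 120000, 118000));
	return 0;
}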
+ */ +#define SEARCH_SCOPE 2 +#define MIN_CACHE_SIZE (64*1024U) +#define DEFAULT_CACHE_SIZE (5*1024*1024U) +#define ITERATIONS 1 +#define SIZE_THRESH 130 +#define COST_THRESH 130 + +/* + * The migration cost is a function of 'domain distance'. Domain + * distance is the number of steps a CPU has to iterate down its + * domain tree to share a domain with the other CPU. The farther + * two CPUs are from each other, the larger the distance gets. + * + * Note that we use the distance only to cache measurement results, + * the distance value is not used numerically otherwise. When two + * CPUs have the same distance it is assumed that the migration + * cost is the same. (this is a simplification but quite practical) + */ +#define MAX_DOMAIN_DISTANCE 32 + +static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; + +/* + * Allow override of migration cost - in units of microseconds. + * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost + * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: + */ +static int __init migration_cost_setup(char *str) +{ + int ints[MAX_DOMAIN_DISTANCE+1], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + + printk("#ints: %d\n", ints[0]); + for (i = 1; i <= ints[0]; i++) { + migration_cost[i-1] = (unsigned long long)ints[i]*1000; + printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); + } + return 1; +} + +__setup ("migration_cost=", migration_cost_setup); + +/* + * Global multiplier (divisor) for migration-cutoff values, + * in percentiles. E.g. use a value of 150 to get 1.5 times + * longer cache-hot cutoff times. + * + * (We scale it from 100 to 128 to long long handling easier.) + */ + +#define MIGRATION_FACTOR_SCALE 128 + +static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; + +static int __init setup_migration_factor(char *str) +{ + get_option(&str, &migration_factor); + migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; + return 1; +} + +__setup("migration_factor=", setup_migration_factor); + +/* + * Estimated distance of two CPUs, measured via the number of domains + * we have to pass for the two CPUs to be in the same span: + */ +static unsigned long domain_distance(int cpu1, int cpu2) +{ + unsigned long distance = 0; + struct sched_domain *sd; + + for_each_domain(cpu1, sd) { + WARN_ON(!cpu_isset(cpu1, sd->span)); + if (cpu_isset(cpu2, sd->span)) + return distance; + distance++; + } + if (distance >= MAX_DOMAIN_DISTANCE) { + WARN_ON(1); + distance = MAX_DOMAIN_DISTANCE-1; + } + + return distance; +} + +static unsigned int migration_debug; + +static int __init setup_migration_debug(char *str) +{ + get_option(&str, &migration_debug); + return 1; +} + +__setup("migration_debug=", setup_migration_debug); + +/* + * Maximum cache-size that the scheduler should try to measure. + * Architectures with larger caches should tune this up during + * bootup. Gets used in the domain-setup code (i.e. during SMP + * bootup). + */ +unsigned int max_cache_size; + +static int __init setup_max_cache_size(char *str) +{ + get_option(&str, &max_cache_size); + return 1; +} + +__setup("max_cache_size=", setup_max_cache_size); + +/* + * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. 
This + * is the operation that is timed, so we try to generate unpredictable + * cachemisses that still end up filling the L2 cache: + */ +static void touch_cache(void *__cache, unsigned long __size) +{ + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; + unsigned long *cache = __cache; + int i; + + for (i = 0; i < size/6; i += 8) { + switch (i % 6) { + case 0: cache[i]++; + case 1: cache[size-1-i]++; + case 2: cache[chunk1-i]++; + case 3: cache[chunk1+i]++; + case 4: cache[chunk2-i]++; + case 5: cache[chunk2+i]++; + } + } +} + +/* + * Measure the cache-cost of one task migration. Returns in units of nsec. + */ +static unsigned long long +measure_one(void *cache, unsigned long size, int source, int target) +{ + cpumask_t mask, saved_mask; + unsigned long long t0, t1, t2, t3, cost; + + saved_mask = current->cpus_allowed; + + /* + * Flush source caches to RAM and invalidate them: + */ + sched_cacheflush(); + + /* + * Migrate to the source CPU: + */ + mask = cpumask_of_cpu(source); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != source); + + /* + * Dirty the working set: + */ + t0 = sched_clock(); + touch_cache(cache, size); + t1 = sched_clock(); + + /* + * Migrate to the target CPU, dirty the L2 cache and access + * the shared buffer. (which represents the working set + * of a migrated task.) + */ + mask = cpumask_of_cpu(target); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != target); + + t2 = sched_clock(); + touch_cache(cache, size); + t3 = sched_clock(); + + cost = t1-t0 + t3-t2; + + if (migration_debug >= 2) + printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", + source, target, t1-t0, t1-t0, t3-t2, cost); + /* + * Flush target caches to RAM and invalidate them: + */ + sched_cacheflush(); + + set_cpus_allowed(current, saved_mask); + + return cost; +} + +/* + * Measure a series of task migrations and return the average + * result. Since this code runs early during bootup the system + * is 'undisturbed' and the average latency makes sense. + * + * The algorithm in essence auto-detects the relevant cache-size, + * so it will properly detect different cachesizes for different + * cache-hierarchies, depending on how the CPUs are connected. + * + * Architectures can prime the upper limit of the search range via + * max_cache_size, otherwise the search range defaults to 20MB...64K. + */ +static unsigned long long +measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) +{ + unsigned long long cost1, cost2; + int i; + + /* + * Measure the migration cost of 'size' bytes, over an + * average of 10 runs: + * + * (We perturb the cache size by a small (0..4k) + * value to compensate size/alignment related artifacts. + * We also subtract the cost of the operation done on + * the same CPU.) 
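/*
 * Illustrative aside (user-space analogue, not from this patch): measure_one()
 * above pins itself to the source CPU, dirties the buffer, then pins itself to
 * the target CPU and dirties it again, timing both passes.  A rough user-space
 * sketch using sched_setaffinity() and clock_gettime() follows; the CPU
 * numbers, buffer size and the simple linear stride are invented, and it skips
 * the cache-flush step the kernel performs between passes.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
}

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void dirty(volatile long *buf, size_t words)
{
	size_t i;

	for (i = 0; i < words; i++)	/* plain linear pass, unlike touch_cache() */
		buf[i]++;
}

int main(void)
{
	size_t words = (4 * 1024 * 1024) / sizeof(long);
	volatile long *buf = calloc(words, sizeof(long));
	long long t0, t1, t2, t3;

	if (!buf)
		return 1;

	pin_to_cpu(0);			/* "source" */
	t0 = now_ns();
	dirty(buf, words);
	t1 = now_ns();

	pin_to_cpu(1);			/* "target" */
	t2 = now_ns();
	dirty(buf, words);
	t3 = now_ns();

	printf("source pass: %lld ns, target pass: %lld ns, sum: %lld ns\n",
	       t1 - t0, t3 - t2, (t1 - t0) + (t3 - t2));
	free((void *)buf);
	return 0;
}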
+ */ + cost1 = 0; + + /* + * dry run, to make sure we start off cache-cold on cpu1, + * and to get any vmalloc pagefaults in advance: + */ + measure_one(cache, size, cpu1, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); + + measure_one(cache, size, cpu2, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); + + /* + * (We measure the non-migrating [cached] cost on both + * cpu1 and cpu2, to handle CPUs with different speeds) + */ + cost2 = 0; + + measure_one(cache, size, cpu1, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); + + measure_one(cache, size, cpu2, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); + + /* + * Get the per-iteration migration cost: + */ + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); + + return cost1 - cost2; +} + +static unsigned long long measure_migration_cost(int cpu1, int cpu2) +{ + unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; + unsigned int max_size, size, size_found = 0; + long long cost = 0, prev_cost; + void *cache; + + /* + * Search from max_cache_size*5 down to 64K - the real relevant + * cachesize has to lie somewhere inbetween. + */ + if (max_cache_size) { + max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); + size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); + } else { + /* + * Since we have no estimation about the relevant + * search range + */ + max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; + size = MIN_CACHE_SIZE; + } + + if (!cpu_online(cpu1) || !cpu_online(cpu2)) { + printk("cpu %d and %d not both online!\n", cpu1, cpu2); + return 0; + } + + /* + * Allocate the working set: + */ + cache = vmalloc(max_size); + if (!cache) { + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); + return 1000000; /* return 1 msec on very small boxen */ + } + + while (size <= max_size) { + prev_cost = cost; + cost = measure_cost(cpu1, cpu2, cache, size); + + /* + * Update the max: + */ + if (cost > 0) { + if (max_cost < cost) { + max_cost = cost; + size_found = size; + } + } + /* + * Calculate average fluctuation, we use this to prevent + * noise from triggering an early break out of the loop: + */ + fluct = abs(cost - prev_cost); + avg_fluct = (avg_fluct + fluct)/2; + + if (migration_debug) + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", + cpu1, cpu2, size, + (long)cost / 1000000, + ((long)cost / 100000) % 10, + (long)max_cost / 1000000, + ((long)max_cost / 100000) % 10, + domain_distance(cpu1, cpu2), + cost, avg_fluct); + + /* + * If we iterated at least 20% past the previous maximum, + * and the cost has dropped by more than 20% already, + * (taking fluctuations into account) then we assume to + * have found the maximum and break out of the loop early: + */ + if (size_found && (size*100 > size_found*SIZE_THRESH)) + if (cost+avg_fluct <= 0 || + max_cost*100 > (cost+avg_fluct)*COST_THRESH) { + + if (migration_debug) + printk("-> found max.\n"); + break; + } + /* + * Increase the cachesize in 10% steps: + */ + size = size * 10 / 9; + } + + if (migration_debug) + printk("[%d][%d] working set size found: %d, cost: %Ld\n", + cpu1, cpu2, size_found, max_cost); + + vfree(cache); + + /* + * A task is considered 'cache cold' if at least 2 times + * the worst-case cost of migration has passed. 
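/*
 * Illustrative aside (user-space model, not from this patch):
 * measure_migration_cost() above walks buffer sizes up in ~10% steps, tracks
 * the maximum cost seen, and bails out once it is well past the size of that
 * maximum and the cost has clearly dropped (SIZE_THRESH and COST_THRESH are
 * both 130 in the patch).  The model below replaces the real measurement with
 * a made-up triangular cost curve and ignores the fluctuation averaging, to
 * show only the search and early-break logic.
 */
#include <stdio.h>
#include <stdlib.h>

#define SIZE_THRESH	130
#define COST_THRESH	130

/* invented stand-in for measure_cost(): peaks around a 2 MB working set */
static long long fake_cost(unsigned int size)
{
	long long peak = 2 * 1024 * 1024;

	return 1000000 - llabs((long long)size - peak) / 4;
}

int main(void)
{
	unsigned int size = 64 * 1024, max_size = 20 * 1024 * 1024;
	unsigned int size_found = 0;
	long long cost, max_cost = 0;

	while (size <= max_size) {
		cost = fake_cost(size);
		if (cost > max_cost) {
			max_cost = cost;
			size_found = size;
		}
		/* 30% past the maximum's size and the cost dropped >20%: stop */
		if (size_found && size * 100 > size_found * (unsigned int)SIZE_THRESH &&
		    max_cost * 100 > cost * COST_THRESH) {
			printf("stopping early at %u bytes\n", size);
			break;
		}
		size = size * 10 / 9;	/* ~10% larger each step */
	}
	printf("max cost %lld at %u bytes\n", max_cost, size_found);
	return 0;
}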
+ * + * (this limit is only listened to if the load-balancing + * situation is 'nice' - if there is a large imbalance we + * ignore it for the sake of CPU utilization and + * processing fairness.) + */ + return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; +} + +static void calibrate_migration_costs(const cpumask_t *cpu_map) +{ + int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); + unsigned long j0, j1, distance, max_distance = 0; + struct sched_domain *sd; + + j0 = jiffies; + + /* + * First pass - calculate the cacheflush times: + */ + for_each_cpu_mask(cpu1, *cpu_map) { + for_each_cpu_mask(cpu2, *cpu_map) { + if (cpu1 == cpu2) + continue; + distance = domain_distance(cpu1, cpu2); + max_distance = max(max_distance, distance); + /* + * No result cached yet? + */ + if (migration_cost[distance] == -1LL) + migration_cost[distance] = + measure_migration_cost(cpu1, cpu2); + } + } + /* + * Second pass - update the sched domain hierarchy with + * the new cache-hot-time estimations: + */ + for_each_cpu_mask(cpu, *cpu_map) { + distance = 0; + for_each_domain(cpu, sd) { + sd->cache_hot_time = migration_cost[distance]; + distance++; + } + } + /* + * Print the matrix: + */ + if (migration_debug) + printk("migration: max_cache_size: %d, cpu: %d MHz:\n", + max_cache_size, +#ifdef CONFIG_X86 + cpu_khz/1000 +#else + -1 +#endif + ); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); + } + j1 = jiffies; + if (migration_debug) + printk("migration: %ld seconds\n", (j1-j0) / HZ); + + /* + * Move back to the original CPU. NUMA-Q gets confused + * if we migrate to another quad during bootup. 
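/*
 * Illustrative aside (user-space model, not from this patch):
 * calibrate_migration_costs() above measures CPU pairs but caches the result
 * per "domain distance", so all pairs at the same distance share one
 * measurement.  The toy below models 4 CPUs as 2 sibling pairs (distance 0
 * inside a pair, 1 across pairs) with a fake measurement; only the caching
 * structure mirrors the patch.
 */
#include <stdio.h>

#define NCPUS		4
#define MAX_DIST	2

static long long cost_cache[MAX_DIST] = { -1, -1 };

static int domain_distance_model(int c1, int c2)
{
	return (c1 / 2 == c2 / 2) ? 0 : 1;	/* invented topology */
}

static long long measure(int c1, int c2)
{
	printf("  measuring %d <-> %d\n", c1, c2);
	return domain_distance_model(c1, c2) ? 250000 : 80000;	/* fake ns */
}

int main(void)
{
	int c1, c2, d;

	for (c1 = 0; c1 < NCPUS; c1++)
		for (c2 = 0; c2 < NCPUS; c2++) {
			if (c1 == c2)
				continue;
			d = domain_distance_model(c1, c2);
			if (cost_cache[d] == -1)	/* no result cached yet */
				cost_cache[d] = measure(c1, c2);
		}

	for (d = 0; d < MAX_DIST; d++)
		printf("distance %d: cache_hot_time = %lld ns\n",
		       d, cost_cache[d]);
	return 0;
}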
+ */ + if (raw_smp_processor_id() != orig_cpu) { + cpumask_t mask = cpumask_of_cpu(orig_cpu), + saved_mask = current->cpus_allowed; + + set_cpus_allowed(current, mask); + set_cpus_allowed(current, saved_mask); + } +} + #ifdef CONFIG_NUMA /** @@ -5791,6 +6574,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -5798,7 +6582,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, + sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -5817,8 +6601,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; @@ -5877,8 +6661,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -5889,11 +6672,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5908,8 +6691,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -5937,7 +6719,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; - sd = &per_cpu(node_domains, j); sd->groups = sg; } @@ -5982,22 +6763,19 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); - + sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); - + sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); - + sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); } @@ -6025,6 +6803,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) #endif cpu_attach_domain(sd, i); } + /* + * Tune cache-hot values: + */ + calibrate_migration_costs(cpu_map); return 0; @@ -6231,12 +7013,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, 
non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -6250,51 +7030,28 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; -#endif -} - void __init sched_init(void) { - u64 now = sched_clock(); + int i, j, k; int highest_cpu = 0; - int i, j; - - /* - * Link up the scheduling class hierarchy: - */ - rt_sched_class.next = &fair_sched_class; - fair_sched_class.next = &idle_sched_class; - idle_sched_class.next = NULL; for_each_possible_cpu(i) { - struct rt_prio_array *array; + struct prio_array *array; struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; - init_cfs_rq(&rq->cfs, rq); -#ifdef CONFIG_FAIR_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); -#endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; - rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; @@ -6302,14 +7059,16 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); - array = &rq->rt.active; - for (j = 0; j < MAX_RT_PRIO; j++) { - INIT_LIST_HEAD(array->queue + j); - __clear_bit(j, array->bitmap); + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); } highest_cpu = i; - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -6336,10 +7095,6 @@ void __init sched_init(void) * when this runqueue becomes "idle". 
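/*
 * Illustrative aside (user-space model, not from this patch): the restored
 * sched_init() sets bit MAX_PRIO in each array's bitmap as a delimiter, so a
 * find-first-bit over MAX_PRIO+1 bits on an empty array yields MAX_PRIO
 * instead of scanning past the end.  The snippet below shows that effect with
 * a plain word array and __builtin_ctzl().
 */
#include <stdio.h>

#define MAX_PRIO	140
#define BITS		(sizeof(unsigned long) * 8)
#define WORDS		((MAX_PRIO + 1 + BITS - 1) / BITS)

static int find_first_bit_model(const unsigned long *map)
{
	unsigned int w;

	for (w = 0; w < WORDS; w++)
		if (map[w])
			return w * BITS + __builtin_ctzl(map[w]);
	return MAX_PRIO + 1;	/* not reached once the delimiter is set */
}

int main(void)
{
	unsigned long bitmap[WORDS] = { 0 };

	bitmap[MAX_PRIO / BITS] |= 1UL << (MAX_PRIO % BITS);	/* delimiter */
	printf("empty array -> first bit %d (== MAX_PRIO)\n",
	       find_first_bit_model(bitmap));

	bitmap[120 / BITS] |= 1UL << (120 % BITS);		/* a nice-0 task */
	printf("with a task -> first bit %d\n", find_first_bit_model(bitmap));
	return 0;
}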
*/ init_idle(current, smp_processor_id()); - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6370,55 +7125,29 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { + struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int on_rq; read_lock_irq(&tasklist_lock); + do_each_thread(g, p) { - p->se.fair_key = 0; - p->se.wait_runtime = 0; - p->se.wait_start_fair = 0; - p->se.wait_start = 0; - p->se.exec_start = 0; - p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; - p->se.block_start = 0; - task_rq(p)->cfs.fair_clock = 0; - task_rq(p)->clock = 0; - - if (!rt_task(p)) { - /* - * Renice negative nice level userspace - * tasks back to 0: - */ - if (TASK_NICE(p) < 0 && p->mm) - set_user_nice(p, 0); + if (!rt_task(p)) continue; - } spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); -#ifdef CONFIG_SMP - /* - * Do not touch the migration thread: - */ - if (p == rq->migration_thread) - goto out_unlock; -#endif - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(task_rq(p), p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(task_rq(p), p, 0); + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, task_rq(p)); resched_task(rq->curr); } -#ifdef CONFIG_SMP - out_unlock: -#endif + __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p); diff --git a/trunk/kernel/sched_debug.c b/trunk/kernel/sched_debug.c deleted file mode 100644 index 1baf87cceb7c..000000000000 --- a/trunk/kernel/sched_debug.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * kernel/time/sched_debug.c - * - * Print the CFS rbtree - * - * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include - -/* - * This allows printing both to /proc/sched_debug and - * to the console - */ -#define SEQ_printf(m, x...) 
\ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) - -static void -print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) -{ - if (rq->curr == p) - SEQ_printf(m, "R"); - else - SEQ_printf(m, " "); - - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " - "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - p->comm, p->pid, - (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, - (long long)(p->nvcsw + p->nivcsw), - p->prio, - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); -} - -static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) -{ - struct task_struct *g, *p; - - SEQ_printf(m, - "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" - "------------------------------------------------------------------" - "----------------" - "------------------------------------------------" - "--------------------------------\n"); - - read_lock_irq(&tasklist_lock); - - do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) - continue; - - print_task(m, rq, p, now); - } while_each_thread(g, p); - - read_unlock_irq(&tasklist_lock); -} - -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; - struct rq *rq = &per_cpu(runqueues, cpu); - - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) -{ - SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(fair_clock); - P(exec_clock); - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); - P(sleeper_bonus); -#undef P - - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); -} - -static void print_cpu(struct seq_file *m, int cpu, u64 now) -{ - struct rq *rq = &per_cpu(runqueues, cpu); - -#ifdef CONFIG_X86 - { - unsigned int freq = cpu_khz ? 
: 1; - - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", - cpu, freq / 1000, (freq % 1000)); - } -#else - SEQ_printf(m, "\ncpu#%d\n", cpu); -#endif - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) - - P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); - P(nr_switches); - P(nr_load_updates); - P(nr_uninterruptible); - SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - P(next_balance); - P(curr->pid); - P(clock); - P(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_unstable_events); - P(clock_max_delta); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); -#undef P - - print_cfs_stats(m, cpu, now); - - print_rq(m, rq, cpu, now); -} - -static int sched_debug_show(struct seq_file *m, void *v) -{ - u64 now = ktime_to_ns(ktime_get()); - int cpu; - - SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n", - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); - - for_each_online_cpu(cpu) - print_cpu(m, cpu, now); - - SEQ_printf(m, "\n"); - - return 0; -} - -void sysrq_sched_debug_show(void) -{ - sched_debug_show(NULL, NULL); -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_debug_show, NULL); -} - -static struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init init_sched_debug_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = create_proc_entry("sched_debug", 0644, NULL); - if (!pe) - return -ENOMEM; - - pe->proc_fops = &sched_debug_fops; - - return 0; -} - -__initcall(init_sched_debug_procfs); - -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ - unsigned long flags; - int num_threads = 1; - - rcu_read_lock(); - if (lock_task_sighand(p, &flags)) { - num_threads = atomic_read(&p->signal->count); - unlock_task_sighand(p, &flags); - } - rcu_read_unlock(); - - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); - SEQ_printf(m, "----------------------------------------------\n"); -#define P(F) \ - SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) - - P(se.wait_start); - P(se.wait_start_fair); - P(se.exec_start); - P(se.sleep_start); - P(se.sleep_start_fair); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.wait_max); - P(se.wait_runtime); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); - P(se.sum_exec_runtime); - SEQ_printf(m, "%-25s:%20Ld\n", - "nr_switches", (long long)(p->nvcsw + p->nivcsw)); - P(se.load.weight); - P(policy); - P(prio); -#undef P - - { - u64 t0, t1; - - t0 = sched_clock(); - t1 = sched_clock(); - SEQ_printf(m, "%-25s:%20Ld\n", - "clock-delta", (long long)(t1-t0)); - } -} - -void proc_sched_set_task(struct task_struct *p) -{ - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; - p->se.sum_exec_runtime = 0; -} diff --git a/trunk/kernel/sched_fair.c b/trunk/kernel/sched_fair.c deleted file mode 100644 index 6971db0a7160..000000000000 --- a/trunk/kernel/sched_fair.c +++ /dev/null @@ -1,1131 +0,0 @@ -/* - * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) - * - * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar - * - * Interactivity 
improvements by Mike Galbraith - * (C) 2007 Mike Galbraith - * - * Various enhancements by Dmitry Adamushko. - * (C) 2007 Dmitry Adamushko - * - * Group scheduling enhancements by Srivatsa Vaddagiri - * Copyright IBM Corporation, 2007 - * Author: Srivatsa Vaddagiri - * - * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner - */ - -/* - * Preemption granularity: - * (default: 2 msec, units: nanoseconds) - * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) - * - * On SMP systems the value of this is multiplied by the log2 of the - * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way - * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) - */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; - -/* - * SCHED_BATCH wake-up granularity. - * (default: 10 msec, units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = - 10000000000ULL/HZ; - -/* - * SCHED_OTHER wake-up granularity. - * (default: 1 msec, units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; - -unsigned int sysctl_sched_stat_granularity __read_mostly; - -/* - * Initialized in sched_init_granularity(): - */ -unsigned int sysctl_sched_runtime_limit __read_mostly; - -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -unsigned int sysctl_sched_features __read_mostly = - SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *1 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; - -extern struct sched_class fair_sched_class; - -/************************************************************** - * CFS operations on generic schedulable entities: - */ - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; - - return &rq->curr->se; -} - -#define entity_is_task(se) 1 - -static inline void 
-set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - - -/************************************************************** - * Scheduling class tree data structure manipulation methods: - */ - -/* - * Enqueue an entity into the rb-tree: - */ -static inline void -__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - s64 key = se->fair_key; - int leftmost = 1; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (key - entry->fair_key < 0) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } - } - - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; -} - -static inline void -__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; -} - -static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rb_leftmost; -} - -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) -{ - return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); -} - -/************************************************************** - * Scheduling class statistics methods: - */ - -/* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: - */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) -{ - u64 tmp; - - /* - * Negative nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight >= NICE_0_LOAD)) - return granularity; - /* - * Positive nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.weight * (u64)granularity; - - /* - * It will always fit into 'long': - */ - return (long) (tmp >> NICE_0_SHIFT); -} - -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - long limit = sysctl_sched_runtime_limit; - - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } -} - -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); -} - -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - 
schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); -} - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. - */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) -{ - unsigned long delta, delta_exec, delta_fair; - long delta_mine; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; - - if (unlikely(!load)) - return; - - delta_exec = curr->delta_exec; -#ifdef CONFIG_SCHEDSTATS - if (unlikely(delta_exec > curr->exec_max)) - curr->exec_max = delta_exec; -#endif - - curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { - delta = calc_delta_mine(cfs_rq->sleeper_bonus, - curr->load.weight, lw); - if (unlikely(delta > cfs_rq->sleeper_bonus)) - delta = cfs_rq->sleeper_bonus; - - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; - } - - cfs_rq->fair_clock += delta_fair; - /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: - */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); -} - -static void update_curr(struct cfs_rq *cfs_rq, u64 now) -{ - struct sched_entity *curr = cfs_rq_curr(cfs_rq); - unsigned long delta_exec; - - if (unlikely(!curr)) - return; - - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - - curr->delta_exec += delta_exec; - - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr, now); - curr->delta_exec = 0; - } - curr->exec_start = now; -} - -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - se->wait_start_fair = cfs_rq->fair_clock; - se->wait_start = now; -} - -/* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime limit: - */ -#if BITS_PER_LONG == 32 -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - u64 tmp = (u64)delta * weight >> shift; - - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; -} -#else -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; -} -#endif - -/* - * Task is being enqueued - update stats: - */ -static void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - s64 key; - - /* - * Are we enqueueing a waiting task? 
(for current tasks - * a dequeue/enqueue event is a NOP) - */ - if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_start(cfs_rq, se, now); - /* - * Update the key: - */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.weight) >> NICE_0_SHIFT; - } - } - - se->fair_key = key; -} - -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - unsigned long delta_fair = se->delta_fair_run; - -#ifdef CONFIG_SCHEDSTATS - { - s64 delta_wait = now - se->wait_start; - if (unlikely(delta_wait > se->wait_max)) - se->wait_max = delta_wait; - } -#endif - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - add_wait_runtime(cfs_rq, se, delta_fair); -} - -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - unsigned long delta_fair; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se, now); - se->delta_fair_run = 0; - } - - se->wait_start_fair = 0; - se->wait_start = 0; -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - update_curr(cfs_rq, now); - /* - * Mark the end of the wait period if dequeueing a - * waiting task: - */ - if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_end(cfs_rq, se, now); -} - -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - /* - * We are starting a new run period: - */ - se->exec_start = now; -} - -/* - * We are descheduling a task - update its stats: - */ -static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - se->exec_start = 0; -} - -/************************************************** - * Scheduling class queueing methods: - */ - -static void -__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - unsigned long load = cfs_rq->load.weight, delta_fair; - long prev_runtime; - - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) - load = rq_of(cfs_rq)->cpu_load[2]; - - delta_fair = se->delta_fair_sleep; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; - - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); -} - -static void -enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - struct 
task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se, now); - se->delta_fair_sleep = 0; - } - - se->sleep_start_fair = 0; - -#ifdef CONFIG_SCHEDSTATS - if (se->sleep_start) { - u64 delta = now - se->sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; - - se->sleep_start = 0; - se->sum_sleep_runtime += delta; - } - if (se->block_start) { - u64 delta = now - se->block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->block_max)) - se->block_max = delta; - - se->block_start = 0; - se->sum_sleep_runtime += delta; - } -#endif -} - -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int wakeup, u64 now) -{ - /* - * Update the fair clock. - */ - update_curr(cfs_rq, now); - - if (wakeup) - enqueue_sleeper(cfs_rq, se, now); - - update_stats_enqueue(cfs_rq, se, now); - __enqueue_entity(cfs_rq, se); -} - -static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int sleep, u64 now) -{ - update_stats_dequeue(cfs_rq, se, now); - if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = now; - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = now; - } - cfs_rq->wait_runtime -= se->wait_runtime; -#endif - } - __dequeue_entity(cfs_rq, se); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) -{ - s64 __delta = curr->fair_key - se->fair_key; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - */ - if (__delta > niced_granularity(curr, granularity)) - resched_task(rq_of(cfs_rq)->curr); -} - -static inline void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. 
(note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) - */ - update_stats_wait_end(cfs_rq, se, now); - update_stats_curr_start(cfs_rq, se, now); - set_cfs_rq_curr(cfs_rq, se); -} - -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) -{ - struct sched_entity *se = __pick_next_entity(cfs_rq); - - set_next_entity(cfs_rq, se, now); - - return se; -} - -static void -put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) -{ - /* - * If still on the runqueue then deactivate_task() - * was not called and update_curr() has to be done: - */ - if (prev->on_rq) - update_curr(cfs_rq, now); - - update_stats_curr_end(cfs_rq, prev, now); - - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev, now); - set_cfs_rq_curr(cfs_rq, NULL); -} - -static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - struct rq *rq = rq_of(cfs_rq); - struct sched_entity *next; - u64 now = __rq_clock(rq); - - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_entity(cfs_rq, curr, 0, now); - enqueue_entity(cfs_rq, curr, 0, now); - - /* - * Reschedule if another task tops the current one. - */ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; - - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); -} - -/************************************************** - * CFS operations on tasks: - */ - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) tasks belong to the same group ? 
*/ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) -{ - if (curr->se.cfs_rq == p->se.cfs_rq) - return 1; - - return 0; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) -{ - return 1; -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -/* - * The enqueue_task method is called before nr_running is - * increased. Here we update the fair scheduling stats and - * then put the task into the rbtree: - */ -static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - if (se->on_rq) - break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup, now); - } -} - -/* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: - */ -static void -dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep, now); - /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) - break; - } -} - -/* - * sched_yield() support is very simple - we dequeue and enqueue - */ -static void yield_task_fair(struct rq *rq, struct task_struct *p) -{ - struct cfs_rq *cfs_rq = task_cfs_rq(p); - u64 now = __rq_clock(rq); - - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_entity(cfs_rq, &p->se, 0, now); - enqueue_entity(cfs_rq, &p->se, 0, now); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; - - if (unlikely(rt_prio(p->prio))) { - update_curr(cfs_rq, rq_clock(rq)); - resched_task(curr); - return; - } - - gran = sysctl_sched_wakeup_granularity; - /* - * Batch tasks prefer throughput over latency: - */ - if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; - - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); -} - -static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) -{ - struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; - - if (unlikely(!cfs_rq->nr_running)) - return NULL; - - do { - se = pick_next_entity(cfs_rq, now); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - return task_of(se); -} - -/* - * Account for a descheduled task: - */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) -{ - struct sched_entity *se = &prev->se; - struct cfs_rq *cfs_rq; 
- - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se, now); - } -} - -/************************************************** - * Fair scheduling class load-balancing methods: - */ - -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static inline struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) -{ - struct task_struct *p; - - if (!curr) - return NULL; - - p = rb_entry(curr, struct task_struct, se.run_node); - cfs_rq->rb_load_balance_curr = rb_next(curr); - - return p; -} - -static struct task_struct *load_balance_start_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; - - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); -} - -static struct task_struct *load_balance_next_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; - - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); -} - -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running) - return MAX_PRIO; - - curr = __pick_next_entity(cfs_rq); - p = task_of(curr); - - return p->prio; -} - -static int -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) -{ - struct cfs_rq *busy_cfs_rq; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { - struct cfs_rq *this_cfs_rq; - long imbalance; - unsigned long maxload; - int this_best_prio, best_prio, best_prio_seen = 0; - - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - - imbalance = busy_cfs_rq->load.weight - - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) - continue; - - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); - - this_best_prio = cfs_rq_best_prio(this_cfs_rq); - best_prio = cfs_rq_best_prio(busy_cfs_rq); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. 
- */ - if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) - best_prio_seen = 1; - - /* pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - nr_moved = balance_tasks(this_rq, this_cpu, busiest, - max_nr_move, maxload, sd, idle, all_pinned, - &load_moved, this_best_prio, best_prio, - best_prio_seen, &cfs_rq_iterator); - - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; - - if (max_nr_move <= 0 || rem_load_move <= 0) - break; - } - - *total_load_moved = max_load_move - rem_load_move; - - return total_nr_moved; -} - -/* - * scheduler tick hitting a task of our scheduling class: - */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &curr->se; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se); - } -} - -/* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. - */ -static void task_new_fair(struct rq *rq, struct task_struct *p) -{ - struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se; - u64 now = rq_clock(rq); - - sched_info_queued(p); - - update_stats_enqueue(cfs_rq, se, now); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. We set up the key so that - * it will preempt the parent: - */ - p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; - /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - p->se.wait_start_fair = 0; - - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -(sysctl_sched_granularity / 2); - - __enqueue_entity(cfs_rq, se); - inc_nr_running(p, rq, now); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. 
- */ -static void set_curr_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se; - u64 now = rq_clock(rq); - struct cfs_rq *cfs_rq; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se, now); - } -} -#else -static void set_curr_task_fair(struct rq *rq) -{ -} -#endif - -/* - * All the scheduling class methods: - */ -struct sched_class fair_sched_class __read_mostly = { - .enqueue_task = enqueue_task_fair, - .dequeue_task = dequeue_task_fair, - .yield_task = yield_task_fair, - - .check_preempt_curr = check_preempt_curr_fair, - - .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, - - .load_balance = load_balance_fair, - - .set_curr_task = set_curr_task_fair, - .task_tick = task_tick_fair, - .task_new = task_new_fair, -}; - -#ifdef CONFIG_SCHED_DEBUG -void print_cfs_stats(struct seq_file *m, int cpu, u64 now) -{ - struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; - - for_each_leaf_cfs_rq(rq, cfs_rq) - print_cfs_rq(m, cpu, cfs_rq, now); -} -#endif diff --git a/trunk/kernel/sched_idletask.c b/trunk/kernel/sched_idletask.c deleted file mode 100644 index 41841e741c4a..000000000000 --- a/trunk/kernel/sched_idletask.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * idle-task scheduling class. - * - * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched_fair.c) - */ - -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) -{ - resched_task(rq->idle); -} - -static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) -{ - schedstat_inc(rq, sched_goidle); - - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) -{ -} - -static int -load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) -{ - return 0; -} - -static void task_tick_idle(struct rq *rq, struct task_struct *curr) -{ -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -static struct sched_class idle_sched_class __read_mostly = { - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task = pick_next_task_idle, - .put_prev_task = put_prev_task_idle, - - .load_balance = load_balance_idle, - - .task_tick = task_tick_idle, - /* no .task_new for idle tasks */ -}; diff --git a/trunk/kernel/sched_rt.c b/trunk/kernel/sched_rt.c deleted file mode 100644 index 1192a2741b99..000000000000 --- a/trunk/kernel/sched_rt.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR - * policies) - */ - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. 
- */ -static inline void update_curr_rt(struct rq *rq, u64 now) -{ - struct task_struct *curr = rq->curr; - u64 delta_exec; - - if (!task_has_rt_policy(curr)) - return; - - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - if (unlikely(delta_exec > curr->se.exec_max)) - curr->se.exec_max = delta_exec; - - curr->se.sum_exec_runtime += delta_exec; - curr->se.exec_start = now; -} - -static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) -{ - struct rt_prio_array *array = &rq->rt.active; - - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); -} - -/* - * Adding/removing a task to/from a priority array: - */ -static void -dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - struct rt_prio_array *array = &rq->rt.active; - - update_curr_rt(rq, now); - - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); -} - -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. - */ -static void requeue_task_rt(struct rq *rq, struct task_struct *p) -{ - struct rt_prio_array *array = &rq->rt.active; - - list_move_tail(&p->run_list, array->queue + p->prio); -} - -static void -yield_task_rt(struct rq *rq, struct task_struct *p) -{ - requeue_task_rt(rq, p); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) -{ - if (p->prio < rq->curr->prio) - resched_task(rq->curr); -} - -static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) -{ - struct rt_prio_array *array = &rq->rt.active; - struct task_struct *next; - struct list_head *queue; - int idx; - - idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) - return NULL; - - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - - next->se.exec_start = now; - - return next; -} - -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) -{ - update_curr_rt(rq, now); - p->se.exec_start = 0; -} - -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. 
Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static struct task_struct *load_balance_start_rt(void *arg) -{ - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; - - idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) - return NULL; - - head = array->queue + idx; - curr = head->prev; - - p = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - rq->rt.rt_load_balance_curr = curr; - - return p; -} - -static struct task_struct *load_balance_next_rt(void *arg) -{ - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; - - idx = rq->rt.rt_load_balance_idx; - head = rq->rt.rt_load_balance_head; - curr = rq->rt.rt_load_balance_curr; - - /* - * If we arrived back to the head again then - * iterate to the next queue (if any): - */ - if (unlikely(head == curr)) { - int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - - if (next_idx >= MAX_RT_PRIO) - return NULL; - - idx = next_idx; - head = array->queue + idx; - curr = head->prev; - - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - } - - p = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - - rq->rt.rt_load_balance_curr = curr; - - return p; -} - -static int -load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved) -{ - int this_best_prio, best_prio, best_prio_seen = 0; - int nr_moved; - struct rq_iterator rt_rq_iterator; - - best_prio = sched_find_first_bit(busiest->rt.active.bitmap); - this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. - */ - if (busiest->curr->prio == best_prio) - best_prio_seen = 1; - - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - /* pass 'busiest' rq argument into - * load_balance_[start|next]_rt iterators - */ - rt_rq_iterator.arg = busiest; - - nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, - max_load_move, sd, idle, all_pinned, load_moved, - this_best_prio, best_prio, best_prio_seen, - &rt_rq_iterator); - - return nr_moved; -} - -static void task_tick_rt(struct rq *rq, struct task_struct *p) -{ - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. 
- */ - if (p->policy != SCHED_RR) - return; - - if (--p->time_slice) - return; - - p->time_slice = static_prio_timeslice(p->static_prio); - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task_rt(rq, p); -} - -/* - * No parent/child timeslice management necessary for RT tasks, - * just activate them: - */ -static void task_new_rt(struct rq *rq, struct task_struct *p) -{ - activate_task(rq, p, 1); -} - -static struct sched_class rt_sched_class __read_mostly = { - .enqueue_task = enqueue_task_rt, - .dequeue_task = dequeue_task_rt, - .yield_task = yield_task_rt, - - .check_preempt_curr = check_preempt_curr_rt, - - .pick_next_task = pick_next_task_rt, - .put_prev_task = put_prev_task_rt, - - .load_balance = load_balance_rt, - - .task_tick = task_tick_rt, - .task_new = task_new_rt, -}; diff --git a/trunk/kernel/sched_stats.h b/trunk/kernel/sched_stats.h deleted file mode 100644 index c63c38f6fa6e..000000000000 --- a/trunk/kernel/sched_stats.h +++ /dev/null @@ -1,235 +0,0 @@ - -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 14 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, - rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - preempt_disable(); - for_each_domain(cpu, sd) { - enum cpu_idle_type itype; - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " - "%lu", - sd->lb_cnt[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" - " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - preempt_enable(); -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long 
delta) -{ - if (rq) { - rq->rq_sched_info.run_delay += delta; - rq->rq_sched_info.pcnt++; - } -} - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{ - if (rq) - rq->rq_sched_info.cpu_time += delta; -} -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{} -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -#endif - -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) -/* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. - */ -static inline void sched_info_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. - */ -static void sched_info_arrive(struct task_struct *t) -{ - unsigned long long now = sched_clock(), delta = 0; - - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += delta; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; - - rq_sched_info_arrive(task_rq(t), delta); -} - -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. - */ -static inline void sched_info_queued(struct task_struct *t) -{ - if (unlikely(sched_info_on())) - if (!t->sched_info.last_queued) - t->sched_info.last_queued = sched_clock(); -} - -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. 
- */ -static inline void sched_info_depart(struct task_struct *t) -{ - unsigned long long delta = sched_clock() - t->sched_info.last_arrival; - - t->sched_info.cpu_time += delta; - rq_sched_info_depart(task_rq(t), delta); -} - -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. - */ -static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) -{ - struct rq *rq = task_rq(prev); - - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. - */ - if (prev != rq->idle) - sched_info_depart(prev); - - if (next != rq->idle) - sched_info_arrive(next); -} -static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) -{ - if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ - diff --git a/trunk/kernel/softirq.c b/trunk/kernel/softirq.c index 73217a9e2875..0b9886a00e74 100644 --- a/trunk/kernel/softirq.c +++ b/trunk/kernel/softirq.c @@ -488,6 +488,7 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { + set_user_nice(current, 19); current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); diff --git a/trunk/kernel/sysctl.c b/trunk/kernel/sysctl.c index 51f5dac42a00..30ee462ee79f 100644 --- a/trunk/kernel/sysctl.c +++ b/trunk/kernel/sysctl.c @@ -206,87 +206,7 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; -#ifdef CONFIG_SCHED_DEBUG -static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ -static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ -static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ -static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ -#endif - static ctl_table kern_table[] = { -#ifdef CONFIG_SCHED_DEBUG - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_batch_wakeup_granularity_ns", - .data = &sysctl_sched_batch_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = 
&sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = KERN_PANIC, .procname = "panic", diff --git a/trunk/lib/Kconfig.debug b/trunk/lib/Kconfig.debug index fab32a286371..da95e10cfd70 100644 --- a/trunk/lib/Kconfig.debug +++ b/trunk/lib/Kconfig.debug @@ -105,15 +105,6 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) -config SCHED_DEBUG - bool "Collect scheduler debugging info" - depends on DEBUG_KERNEL && PROC_FS - default y - help - If you say Y here, the /proc/sched_debug file will be provided - that can help debug the scheduler. The runtime overhead of this - option is minimal. - config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS