From 5a81d65296dfa4f531b24df40824c1985dee5e2f Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Sun, 15 Jan 2012 19:40:24 -0500
Subject: [PATCH]

--- yaml ---
r: 286557
b: refs/heads/master
c: 8d973b624ece3b85cfae9474935795d034f72faf
h: refs/heads/master
i:
  286555: 2971f177990926e03e2a499481d71c1118690a6b
v: v3
---
 [refs] | 2 +-
 trunk/Documentation/DocBook/debugobjects.tmpl | 50 -
 trunk/Documentation/RCU/checklist.txt | 6 -
 trunk/Documentation/RCU/rcu.txt | 10 +-
 trunk/Documentation/RCU/stallwarn.txt | 16 +-
 trunk/Documentation/RCU/torture.txt | 13 -
 trunk/Documentation/RCU/trace.txt | 4 +
 trunk/Documentation/RCU/whatisRCU.txt | 19 +-
 trunk/Documentation/atomic_ops.txt | 87 -
 trunk/Documentation/kernel-parameters.txt | 12 +-
 trunk/Documentation/lockdep-design.txt | 63 -
 trunk/Documentation/trace/events.txt | 2 +
 trunk/Documentation/virtual/kvm/api.txt | 16 -
 trunk/MAINTAINERS | 13 +-
 trunk/Makefile | 2 +-
 trunk/arch/Kconfig | 4 -
 trunk/arch/arm/Kconfig | 4 +-
 trunk/arch/arm/common/pl330.c | 116 +-
 trunk/arch/arm/configs/imx_v4_v5_defconfig | 12 +-
 trunk/arch/arm/kernel/process.c | 6 +-
 trunk/arch/arm/kernel/setup.c | 1 -
 trunk/arch/arm/mach-exynos/cpu.c | 5 +
 trunk/arch/arm/mach-imx/Kconfig | 8 +-
 trunk/arch/arm/mach-imx/Makefile | 4 +-
 trunk/arch/arm/mach-imx/clock-imx35.c | 20 +-
 trunk/arch/arm/mach-imx/mach-cpuimx35.c | 8 +-
 .../arm/mach-omap2/omap_hwmod_3xxx_data.c | 4 +
 trunk/arch/arm/mm/init.c | 4 +-
 trunk/arch/arm/mm/proc-v7.S | 6 +-
 trunk/arch/arm/plat-mxc/cpufreq.c | 2 +-
 .../arm/plat-mxc/include/mach/uncompress.h | 1 -
 trunk/arch/arm/plat-mxc/pwm.c | 9 -
 trunk/arch/arm/plat-orion/gpio.c | 6 +-
 .../plat-samsung/include/plat/cpu-freq-core.h | 25 +-
 trunk/arch/avr32/kernel/process.c | 6 +-
 trunk/arch/blackfin/kernel/process.c | 6 +-
 trunk/arch/cris/arch-v32/kernel/time.c | 4 +-
 trunk/arch/ia64/Kconfig | 6 +-
 trunk/arch/ia64/include/asm/cputime.h | 71 +-
 trunk/arch/ia64/mm/contig.c | 3 +-
 trunk/arch/ia64/mm/init.c | 4 +-
 trunk/arch/m68k/platform/68328/timers.c | 4 +-
 trunk/arch/m68k/platform/coldfire/dma_timer.c | 5 +-
 trunk/arch/m68k/platform/coldfire/pit.c | 4 +-
 trunk/arch/m68k/platform/coldfire/sltimers.c | 4 +-
 trunk/arch/m68k/platform/coldfire/timers.c | 4 +-
 trunk/arch/microblaze/include/asm/memblock.h | 14 +
 trunk/arch/microblaze/kernel/process.c | 6 +-
 trunk/arch/microblaze/kernel/prom.c | 3 +-
 trunk/arch/mips/Kconfig | 6 +-
 trunk/arch/mips/kernel/process.c | 6 +-
 trunk/arch/mips/kernel/setup.c | 3 +-
 trunk/arch/mips/sgi-ip27/ip27-memory.c | 5 +-
 trunk/arch/openrisc/include/asm/memblock.h | 24 +
 trunk/arch/openrisc/kernel/idle.c | 6 +-
 trunk/arch/openrisc/kernel/prom.c | 3 +-
 trunk/arch/parisc/kernel/time.c | 6 +-
 trunk/arch/powerpc/Kconfig | 4 +-
 trunk/arch/powerpc/include/asm/cputime.h | 72 +-
 trunk/arch/powerpc/include/asm/kvm_book3s.h | 33 +
 .../arch/powerpc/include/asm/kvm_book3s_64.h | 33 -
 trunk/arch/powerpc/include/asm/memblock.h | 8 +
 trunk/arch/powerpc/kernel/idle.c | 15 +-
 trunk/arch/powerpc/kernel/machine_kexec.c | 3 +
 trunk/arch/powerpc/kernel/prom.c | 20 +-
 trunk/arch/powerpc/kvm/book3s_hv.c | 2 +-
 trunk/arch/powerpc/kvm/book3s_pr.c | 2 -
 trunk/arch/powerpc/kvm/e500.c | 1 -
 trunk/arch/powerpc/mm/init_32.c | 4 +-
 trunk/arch/powerpc/mm/mem.c | 2 +-
 trunk/arch/powerpc/mm/numa.c | 60 +-
 trunk/arch/powerpc/mm/tlb_nohash.c | 1 +
 .../arch/powerpc/platforms/embedded6xx/wii.c | 23 +-
 trunk/arch/powerpc/platforms/iseries/setup.c | 12 +-
 trunk/arch/powerpc/platforms/ps3/mm.c | 1 +
 trunk/arch/powerpc/platforms/pseries/lpar.c | 4 -
 trunk/arch/s390/Kconfig | 6 +-
 trunk/arch/s390/appldata/appldata_os.c | 16 +-
 trunk/arch/s390/include/asm/cputime.h | 142 +-
 trunk/arch/s390/kernel/process.c | 6 +-
 trunk/arch/s390/kernel/setup.c | 4 +-
 trunk/arch/s390/oprofile/hwsampler.c | 7 +-
 trunk/arch/s390/oprofile/init.c | 373 +--
 trunk/arch/s390/oprofile/op_counter.h | 23 -
 trunk/arch/score/Kconfig | 6 +-
 trunk/arch/score/kernel/setup.c | 4 +-
 trunk/arch/sh/Kconfig | 1 -
 trunk/arch/sh/include/asm/memblock.h | 4 +
 trunk/arch/sh/kernel/idle.c | 6 +-
 trunk/arch/sh/kernel/machine_kexec.c | 3 +
 trunk/arch/sh/kernel/setup.c | 3 +-
 trunk/arch/sh/mm/Kconfig | 3 +
 trunk/arch/sh/mm/init.c | 3 +-
 trunk/arch/sparc/Kconfig | 4 +-
 trunk/arch/sparc/include/asm/memblock.h | 8 +
 trunk/arch/sparc/kernel/pci_sun4v.c | 4 +-
 trunk/arch/sparc/kernel/process_64.c | 6 +-
 trunk/arch/sparc/kernel/setup_32.c | 2 +-
 trunk/arch/sparc/mm/init_64.c | 32 +-
 trunk/arch/tile/kernel/process.c | 6 +-
 trunk/arch/tile/mm/fault.c | 4 +-
 trunk/arch/um/kernel/process.c | 6 +-
 trunk/arch/um/kernel/time.c | 6 +-
 trunk/arch/unicore32/kernel/process.c | 6 +-
 trunk/arch/unicore32/kernel/setup.c | 1 -
 trunk/arch/unicore32/mm/init.c | 4 +-
 trunk/arch/unicore32/mm/mmu.c | 1 -
 trunk/arch/x86/Kconfig | 18 +-
 trunk/arch/x86/ia32/ia32entry.S | 43 +-
 trunk/arch/x86/include/asm/alternative-asm.h | 4 +-
 trunk/arch/x86/include/asm/apic.h | 6 -
 trunk/arch/x86/include/asm/apic_flat_64.h | 7 -
 trunk/arch/x86/include/asm/apicdef.h | 1 -
 trunk/arch/x86/include/asm/bitops.h | 76 +-
 trunk/arch/x86/include/asm/cmpxchg.h | 163 +-
 trunk/arch/x86/include/asm/cmpxchg_32.h | 46 +
 trunk/arch/x86/include/asm/cmpxchg_64.h | 43 +
 trunk/arch/x86/include/asm/div64.h | 22 +-
 trunk/arch/x86/include/asm/e820.h | 2 +-
 trunk/arch/x86/include/asm/hardirq.h | 1 -
 trunk/arch/x86/include/asm/i387.h | 2 +-
 trunk/arch/x86/include/asm/insn.h | 7 -
 trunk/arch/x86/include/asm/mach_timer.h | 2 +-
 trunk/arch/x86/include/asm/mc146818rtc.h | 4 +-
 trunk/arch/x86/include/asm/mce.h | 12 +-
 trunk/arch/x86/include/asm/memblock.h | 23 +
 trunk/arch/x86/include/asm/microcode.h | 2 -
 .../x86/include/asm/numachip/numachip_csr.h | 167 --
 trunk/arch/x86/include/asm/percpu.h | 53 +-
 trunk/arch/x86/include/asm/perf_event.h | 44 +-
 trunk/arch/x86/include/asm/pgtable.h | 2 +-
 trunk/arch/x86/include/asm/processor-flags.h | 1 -
 trunk/arch/x86/include/asm/processor.h | 2 +
 trunk/arch/x86/include/asm/spinlock.h | 15 +-
 trunk/arch/x86/include/asm/thread_info.h | 9 +-
 trunk/arch/x86/include/asm/topology.h | 2 +
 trunk/arch/x86/include/asm/tsc.h | 2 -
 trunk/arch/x86/include/asm/uaccess.h | 2 +-
 trunk/arch/x86/include/asm/x86_init.h | 3 -
 trunk/arch/x86/kernel/acpi/boot.c | 10 +-
 trunk/arch/x86/kernel/amd_nb.c | 8 +-
 trunk/arch/x86/kernel/aperture_64.c | 4 +-
 trunk/arch/x86/kernel/apic/Makefile | 1 -
 trunk/arch/x86/kernel/apic/apic.c | 113 +-
 trunk/arch/x86/kernel/apic/apic_flat_64.c | 9 +-
 trunk/arch/x86/kernel/apic/apic_numachip.c | 294 ---
 trunk/arch/x86/kernel/apic/io_apic.c | 6 +-
 trunk/arch/x86/kernel/check.c | 34 +-
 trunk/arch/x86/kernel/cpu/amd.c | 9 +-
 trunk/arch/x86/kernel/cpu/centaur.c | 2 +-
 trunk/arch/x86/kernel/cpu/common.c | 14 +-
 trunk/arch/x86/kernel/cpu/cpu.h | 5 +-
 trunk/arch/x86/kernel/cpu/intel.c | 2 +
 trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c | 34 +-
 trunk/arch/x86/kernel/cpu/mcheck/mce.c | 66 +-
 trunk/arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 +-
 .../arch/x86/kernel/cpu/mcheck/therm_throt.c | 31 +-
 trunk/arch/x86/kernel/cpu/mcheck/threshold.c | 2 +-
 trunk/arch/x86/kernel/cpu/perf_event.c | 262 +-
 trunk/arch/x86/kernel/cpu/perf_event.h | 51 +-
 trunk/arch/x86/kernel/cpu/perf_event_amd.c | 2 +-
 trunk/arch/x86/kernel/cpu/perf_event_intel.c | 88 +-
 trunk/arch/x86/kernel/cpu/powerflags.c | 3 +-
 trunk/arch/x86/kernel/cpu/proc.c | 4 +-
 trunk/arch/x86/kernel/e820.c | 58 +-
 trunk/arch/x86/kernel/entry_32.S | 4 -
 trunk/arch/x86/kernel/entry_64.S | 31 +-
 trunk/arch/x86/kernel/head.c | 2 +-
 trunk/arch/x86/kernel/head32.c | 7 +-
 trunk/arch/x86/kernel/head64.c | 7 +-
 trunk/arch/x86/kernel/hpet.c | 7 +-
 trunk/arch/x86/kernel/irq.c | 11 +-
 trunk/arch/x86/kernel/jump_label.c | 2 +-
 trunk/arch/x86/kernel/microcode_amd.c | 209 +-
 trunk/arch/x86/kernel/microcode_core.c | 5 -
 trunk/arch/x86/kernel/mpparse.c | 12 +-
 trunk/arch/x86/kernel/process.c | 2 +-
 trunk/arch/x86/kernel/process_32.c | 6 +-
 trunk/arch/x86/kernel/process_64.c | 15 +-
 trunk/arch/x86/kernel/ptrace.c | 3 +-
 trunk/arch/x86/kernel/setup.c | 21 +-
 trunk/arch/x86/kernel/smpboot.c | 3 +-
 trunk/arch/x86/kernel/trampoline.c | 4 +-
 trunk/arch/x86/kernel/traps.c | 7 +-
 trunk/arch/x86/kernel/tsc.c | 6 +-
 trunk/arch/x86/kernel/tsc_sync.c | 4 +-
 trunk/arch/x86/kernel/vsyscall_64.c | 77 +-
 trunk/arch/x86/kernel/x86_init.c | 1 -
 trunk/arch/x86/kvm/i8254.c | 10 +-
 trunk/arch/x86/kvm/x86.c | 19 +-
 trunk/arch/x86/lib/inat.c | 9 +-
 trunk/arch/x86/lib/insn.c | 4 +-
 trunk/arch/x86/lib/string_32.c | 8 +-
 trunk/arch/x86/lib/x86-opcode-map.txt | 610 ++---
 trunk/arch/x86/mm/Makefile | 2 +
 trunk/arch/x86/mm/extable.c | 2 +-
 trunk/arch/x86/mm/fault.c | 22 +-
 trunk/arch/x86/mm/init.c | 8 +-
 trunk/arch/x86/mm/init_32.c | 36 +-
 trunk/arch/x86/mm/init_64.c | 2 +-
 trunk/arch/x86/mm/memblock.c | 348 +++
 trunk/arch/x86/mm/memtest.c | 33 +-
 trunk/arch/x86/mm/numa.c | 37 +-
 trunk/arch/x86/mm/numa_32.c | 10 +-
 trunk/arch/x86/mm/numa_64.c | 2 +-
 trunk/arch/x86/mm/numa_emulation.c | 36 +-
 trunk/arch/x86/mm/pageattr.c | 2 +-
 trunk/arch/x86/mm/srat.c | 7 +-
 trunk/arch/x86/net/bpf_jit_comp.c | 4 +-
 trunk/arch/x86/oprofile/Makefile | 3 +-
 trunk/arch/x86/oprofile/init.c | 30 +-
 trunk/arch/x86/oprofile/nmi_int.c | 27 +-
 trunk/arch/x86/oprofile/nmi_timer_int.c | 50 +
 trunk/arch/x86/platform/efi/efi.c | 12 +-
 trunk/arch/x86/tools/Makefile | 11 +-
 trunk/arch/x86/tools/gen-insn-attr-x86.awk | 21 +-
 trunk/arch/x86/tools/insn_sanity.c | 275 --
 trunk/arch/x86/xen/enlighten.c | 2 +
 trunk/arch/x86/xen/mmu.c | 12 +-
 trunk/arch/x86/xen/setup.c | 7 +-
 trunk/arch/xtensa/kernel/time.c | 13 +-
 trunk/block/blk-map.c | 2 +-
 trunk/block/blk-tag.c | 13 +-
 trunk/block/cfq-iosched.c | 12 -
 trunk/block/ioctl.c | 26 +-
 trunk/drivers/ata/Kconfig | 2 +-
 trunk/drivers/base/cpu.c | 7 -
 trunk/drivers/char/random.c | 8 +-
 trunk/drivers/clocksource/acpi_pm.c | 2 +-
 trunk/drivers/clocksource/i8253.c | 6 +-
 trunk/drivers/clocksource/tcb_clksrc.c | 4 +-
 trunk/drivers/cpufreq/cpufreq_conservative.c | 50 +-
 trunk/drivers/cpufreq/cpufreq_ondemand.c | 54 +-
 trunk/drivers/cpufreq/cpufreq_stats.c | 5 +-
 trunk/drivers/dma/Kconfig | 4 +-
 trunk/drivers/edac/i7core_edac.c | 4 +-
 trunk/drivers/edac/mce_amd.c | 4 +-
 trunk/drivers/edac/sb_edac.c | 8 +-
 .../gpu/drm/i915/i915_gem_execbuffer.c | 4 +-
 trunk/drivers/gpu/drm/i915/intel_display.c | 8 +-
 trunk/drivers/gpu/drm/radeon/evergreen.c | 12 -
 .../drivers/gpu/drm/radeon/radeon_atombios.c | 6 +-
 trunk/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 6 +-
 trunk/drivers/hwmon/coretemp.c | 7 +-
 trunk/drivers/input/mouse/sentelic.c | 8 +-
 trunk/drivers/input/mouse/sentelic.h | 3 +-
 trunk/drivers/iommu/intel-iommu.c | 25 +-
 trunk/drivers/iommu/iommu.c | 2 +-
 trunk/drivers/lguest/x86/core.c | 2 +-
 trunk/drivers/macintosh/rack-meter.c | 14 +-
 trunk/drivers/md/bitmap.c | 5 +-
 trunk/drivers/md/linear.c | 1 -
 trunk/drivers/md/md.c | 3 +-
 trunk/drivers/md/raid5.c | 14 +-
 trunk/drivers/media/video/gspca/gspca.c | 6 +-
 trunk/drivers/media/video/omap3isp/ispccdc.c | 2 +-
 trunk/drivers/media/video/omap3isp/ispstat.c | 2 +-
 trunk/drivers/mfd/ab5500-debugfs.c | 2 +-
 trunk/drivers/mfd/ab8500-core.c | 2 -
 trunk/drivers/mfd/adp5520.c | 2 +-
 trunk/drivers/mfd/da903x.c | 3 +-
 trunk/drivers/mfd/jz4740-adc.c | 1 -
 trunk/drivers/mfd/tps6586x.c | 2 +-
 trunk/drivers/mfd/tps65910.c | 2 +-
 trunk/drivers/mfd/twl-core.c | 16 +-
 trunk/drivers/mfd/twl4030-irq.c | 18 +-
 trunk/drivers/mfd/wm8994-core.c | 1 -
 trunk/drivers/mmc/host/mmci.c | 14 +-
 trunk/drivers/net/ethernet/freescale/Kconfig | 4 +-
 trunk/drivers/net/ethernet/marvell/skge.c | 3 -
 .../net/ethernet/mellanox/mlx4/en_cq.c | 1 -
 trunk/drivers/net/ethernet/realtek/r8169.c | 14 +-
 trunk/drivers/net/ethernet/ti/davinci_cpdma.c | 2 -
 trunk/drivers/net/usb/asix.c | 4 -
 trunk/drivers/net/wireless/ath/ath9k/main.c | 3 -
 trunk/drivers/net/wireless/ath/ath9k/rc.c | 4 +-
 trunk/drivers/net/wireless/b43/pio.c | 16 +-
 .../net/wireless/iwlwifi/iwl-agn-rxon.c | 4 +-
 .../drivers/net/wireless/iwlwifi/iwl-agn-tx.c | 5 +-
 trunk/drivers/net/wireless/iwlwifi/iwl-agn.c | 6 -
 .../net/wireless/iwlwifi/iwl-trans-pcie.c | 4 +-
 trunk/drivers/net/wireless/mwifiex/cmdevt.c | 9 +-
 .../drivers/net/wireless/mwifiex/sta_ioctl.c | 7 +-
 trunk/drivers/of/platform.c | 2 +-
 trunk/drivers/oprofile/nmi_timer_int.c | 173 --
 trunk/drivers/oprofile/oprof.c | 30 +-
 trunk/drivers/oprofile/oprof.h | 10 +-
 trunk/drivers/oprofile/timer_int.c | 30 +-
 trunk/drivers/pci/Kconfig | 4 +-
 trunk/drivers/pci/ioapic.c | 15 +-
 trunk/drivers/rtc/interface.c | 50 +-
 trunk/drivers/usb/dwc3/core.c | 2 +-
 trunk/drivers/usb/gadget/epautoconf.c | 3 +
 trunk/drivers/usb/host/isp1760-if.c | 8 +-
 trunk/drivers/usb/musb/musb_host.c | 4 -
 trunk/drivers/watchdog/coh901327_wdt.c | 6 +-
 trunk/drivers/watchdog/hpwdt.c | 5 +-
 trunk/drivers/watchdog/iTCO_wdt.c | 6 +-
 trunk/drivers/watchdog/sp805_wdt.c | 2 +-
 trunk/firmware/README.AddingFirmware | 3 +-
 trunk/fs/btrfs/async-thread.c | 3 +-
 trunk/fs/btrfs/inode.c | 9 +-
 trunk/fs/ceph/dir.c | 29 +-
 trunk/fs/cifs/connect.c | 4 +-
 trunk/fs/compat_ioctl.c | 38 +-
 trunk/fs/fs-writeback.c | 11 +
 trunk/fs/ioctl.c | 2 +-
 trunk/fs/locks.c | 11 +-
 trunk/fs/minix/inode.c | 34 +-
 trunk/fs/proc/array.c | 8 +-
 trunk/fs/proc/stat.c | 65 +-
 trunk/fs/proc/uptime.c | 11 +-
 trunk/fs/xfs/xfs_super.c | 30 +-
 trunk/fs/xfs/xfs_sync.c | 36 -
 trunk/fs/xfs/xfs_sync.h | 2 -
 trunk/include/asm-generic/cputime.h | 64 +-
 trunk/include/linux/bitops.h | 10 +-
 trunk/include/linux/bootmem.h | 2 +-
 trunk/include/linux/cpu.h | 1 -
 trunk/include/linux/debugobjects.h | 6 -
 trunk/include/linux/hardirq.h | 21 +
 trunk/include/linux/jump_label.h | 27 -
 trunk/include/linux/kernel_stat.h | 36 +-
 trunk/include/linux/kvm.h | 1 -
 trunk/include/linux/latencytop.h | 3 +-
 trunk/include/linux/lglock.h | 36 +-
 trunk/include/linux/lockdep.h | 4 -
 trunk/include/linux/memblock.h | 170 +-
 trunk/include/linux/mm.h | 34 +-
 trunk/include/linux/mmzone.h | 8 +-
 trunk/include/linux/perf_event.h | 8 +-
 trunk/include/linux/poison.h | 6 +
 trunk/include/linux/rcupdate.h | 115 +-
 trunk/include/linux/sched.h | 31 +-
 trunk/include/linux/security.h | 2 +-
 trunk/include/linux/srcu.h | 87 +-
 trunk/include/linux/tick.h | 11 +-
 trunk/include/linux/wait.h | 4 +-
 trunk/include/net/dst.h | 1 -
 trunk/include/net/flow.h | 1 -
 trunk/include/net/ip_vs.h | 2 +-
 trunk/include/net/sctp/structs.h | 4 -
 trunk/include/net/sock.h | 4 +-
 trunk/include/trace/events/rcu.h | 122 +-
 trunk/include/trace/events/sched.h | 57 -
 trunk/include/trace/events/writeback.h | 15 +-
 trunk/init/Kconfig | 11 +-
 trunk/init/main.c | 3 +-
 trunk/kernel/Makefile | 20 +-
 trunk/kernel/acct.c | 4 +-
 trunk/kernel/cpu.c | 4 +-
 trunk/kernel/debug/kdb/kdb_support.c | 2 +-
 trunk/kernel/events/Makefile | 2 +-
 trunk/kernel/events/callchain.c | 191 --
 trunk/kernel/events/core.c | 298 ++-
 trunk/kernel/events/internal.h | 39 +-
 trunk/kernel/exit.c | 31 +-
 trunk/kernel/fork.c | 14 +-
 trunk/kernel/futex.c | 28 +-
 trunk/kernel/hung_task.c | 14 +-
 trunk/kernel/itimer.c | 15 +-
 trunk/kernel/jump_label.c | 49 +-
 trunk/kernel/lockdep.c | 83 +-
 trunk/kernel/panic.c | 17 +-
 trunk/kernel/posix-cpu-timers.c | 132 +-
 trunk/kernel/printk.c | 11 +-
 trunk/kernel/ptrace.c | 13 +-
 trunk/kernel/rcu.h | 7 -
 trunk/kernel/rcupdate.c | 12 -
 trunk/kernel/rcutiny.c | 149 +-
 trunk/kernel/rcutiny_plugin.h | 29 +-
 trunk/kernel/rcutorture.c | 225 +-
 trunk/kernel/rcutree.c | 290 +--
 trunk/kernel/rcutree.h | 26 +-
 trunk/kernel/rcutree_plugin.h | 289 +--
 trunk/kernel/rcutree_trace.c | 12 +-
 trunk/kernel/rtmutex-debug.c | 1 -
 trunk/kernel/rtmutex.c | 8 +
 trunk/kernel/{sched/core.c => sched.c} | 2244 ++++++++++++++---
 trunk/kernel/sched/Makefile | 20 -
 trunk/kernel/sched/sched.h | 1166 ---------
 trunk/kernel/sched/stats.c | 111 -
 .../{sched/auto_group.c => sched_autogroup.c} | 33 +-
 .../{sched/auto_group.h => sched_autogroup.h} | 26 +-
 trunk/kernel/{sched/clock.c => sched_clock.c} | 0
 .../kernel/{sched/cpupri.c => sched_cpupri.c} | 4 +-
 .../kernel/{sched/cpupri.h => sched_cpupri.h} | 0
 trunk/kernel/{sched/debug.c => sched_debug.c} | 6 +-
 trunk/kernel/{sched/fair.c => sched_fair.c} | 1004 ++------
 .../{sched/features.h => sched_features.h} | 30 +-
 .../{sched/idle_task.c => sched_idletask.c} | 4 +-
 trunk/kernel/{sched/rt.c => sched_rt.c} | 218 +-
 trunk/kernel/{sched/stats.h => sched_stats.h} | 109 +-
 .../{sched/stop_task.c => sched_stoptask.c} | 4 +-
 trunk/kernel/signal.c | 8 +-
 trunk/kernel/softirq.c | 4 +-
 trunk/kernel/sys.c | 6 +-
 trunk/kernel/time/clockevents.c | 1 +
 trunk/kernel/time/tick-sched.c | 105 +-
 trunk/kernel/time/timekeeping.c | 10 +-
 trunk/kernel/timer.c | 62 +-
 trunk/kernel/trace/trace.c | 106 +-
 trunk/kernel/trace/trace.h | 2 -
 trunk/kernel/trace/trace_events_filter.c | 26 +-
 trunk/kernel/trace/trace_irqsoff.c | 13 +-
 trunk/kernel/trace/trace_output.c | 16 +-
 trunk/kernel/trace/trace_sched_wakeup.c | 13 +-
 trunk/kernel/tsacct.c | 2 +-
 trunk/kernel/wait.c | 4 +-
 trunk/lib/debugobjects.c | 54 +-
 trunk/mm/Kconfig | 6 -
 trunk/mm/filemap.c | 7 +-
 trunk/mm/hugetlb.c | 2 +-
 trunk/mm/memblock.c | 961 ++++---
 trunk/mm/mempolicy.c | 11 +-
 trunk/mm/nobootmem.c | 45 +-
 trunk/mm/page_alloc.c | 508 +++-
 trunk/mm/slub.c | 4 +-
 trunk/net/bluetooth/hci_conn.c | 2 +-
 trunk/net/bluetooth/hci_core.c | 2 +-
 trunk/net/bluetooth/l2cap_core.c | 12 +-
 trunk/net/bluetooth/rfcomm/core.c | 1 -
 trunk/net/bridge/br_netfilter.c | 8 +-
 trunk/net/core/flow.c | 12 -
 trunk/net/core/net-sysfs.c | 7 +-
 trunk/net/core/sock.c | 6 +-
 trunk/net/ipv4/ipconfig.c | 4 -
 trunk/net/ipv4/route.c | 112 +-
 trunk/net/ipv6/ip6_output.c | 2 +-
 trunk/net/llc/af_llc.c | 14 +-
 trunk/net/netfilter/ipvs/ip_vs_conn.c | 2 +-
 trunk/net/netfilter/ipvs/ip_vs_ctl.c | 10 +-
 trunk/net/netfilter/ipvs/ip_vs_sync.c | 2 +-
 trunk/net/netfilter/nf_conntrack_netlink.c | 22 +-
 trunk/net/netfilter/xt_connbytes.c | 6 +-
 trunk/net/nfc/nci/core.c | 2 +-
 trunk/net/packet/af_packet.c | 12 +-
 trunk/net/sched/sch_mqprio.c | 2 +-
 trunk/net/sched/sch_netem.c | 7 +-
 trunk/net/sched/sch_qfq.c | 4 +-
 trunk/net/sctp/associola.c | 2 +-
 trunk/net/sctp/output.c | 8 +-
 trunk/net/sctp/outqueue.c | 6 +-
 trunk/net/sctp/protocol.c | 3 -
 trunk/net/sctp/socket.c | 2 +
 trunk/net/sctp/sysctl.c | 13 -
 trunk/net/socket.c | 16 +-
 trunk/net/xfrm/xfrm_policy.c | 18 +-
 trunk/scripts/kconfig/Makefile | 5 +-
 trunk/security/security.c | 2 +-
 trunk/sound/atmel/ac97c.c | 4 -
 trunk/sound/soc/codecs/wm8776.c | 1 -
 .../perf/Documentation/perf-annotate.txt | 4 +-
 .../perf/Documentation/perf-buildid-list.txt | 2 +-
 .../tools/perf/Documentation/perf-evlist.txt | 2 +-
 trunk/tools/perf/Documentation/perf-kmem.txt | 2 +-
 trunk/tools/perf/Documentation/perf-lock.txt | 2 +-
 .../tools/perf/Documentation/perf-record.txt | 2 +-
 .../tools/perf/Documentation/perf-report.txt | 11 +-
 trunk/tools/perf/Documentation/perf-sched.txt | 2 +-
 .../tools/perf/Documentation/perf-script.txt | 9 +-
 trunk/tools/perf/Documentation/perf-test.txt | 8 +-
 .../perf/Documentation/perf-timechart.txt | 2 +-
 trunk/tools/perf/Makefile | 1 -
 trunk/tools/perf/builtin-annotate.c | 132 +-
 trunk/tools/perf/builtin-buildid-list.c | 53 +-
 trunk/tools/perf/builtin-diff.c | 21 +-
 trunk/tools/perf/builtin-evlist.c | 2 +-
 trunk/tools/perf/builtin-inject.c | 118 +-
 trunk/tools/perf/builtin-kmem.c | 16 +-
 trunk/tools/perf/builtin-kvm.c | 2 +-
 trunk/tools/perf/builtin-lock.c | 12 +-
 trunk/tools/perf/builtin-probe.c | 1 +
 trunk/tools/perf/builtin-record.c | 603 +++--
 trunk/tools/perf/builtin-report.c | 236 +-
 trunk/tools/perf/builtin-sched.c | 200 +-
 trunk/tools/perf/builtin-script.c | 130 +-
 trunk/tools/perf/builtin-stat.c | 134 +-
 trunk/tools/perf/builtin-test.c | 545 +---
 trunk/tools/perf/builtin-timechart.c | 38 +-
 trunk/tools/perf/builtin-top.c | 558 ++--
 trunk/tools/perf/perf.c | 33 +-
 trunk/tools/perf/perf.h | 24 -
 trunk/tools/perf/util/annotate.c | 8 +-
 trunk/tools/perf/util/annotate.h | 5 +-
 trunk/tools/perf/util/build-id.c | 26 +-
 trunk/tools/perf/util/build-id.h | 2 +-
 trunk/tools/perf/util/callchain.h | 3 -
 trunk/tools/perf/util/cgroup.c | 15 +-
 trunk/tools/perf/util/config.c | 5 +-
 trunk/tools/perf/util/debugfs.c | 35 +-
 trunk/tools/perf/util/debugfs.h | 31 +-
 trunk/tools/perf/util/event.c | 360 +--
 trunk/tools/perf/util/event.h | 68 +-
 trunk/tools/perf/util/evlist.c | 299 +--
 trunk/tools/perf/util/evlist.h | 43 +-
 trunk/tools/perf/util/evsel.c | 154 +-
 trunk/tools/perf/util/evsel.h | 8 -
 trunk/tools/perf/util/header.c | 749 +++---
 trunk/tools/perf/util/header.h | 51 +-
 trunk/tools/perf/util/hist.h | 3 +-
 trunk/tools/perf/util/include/linux/bitops.h | 118 -
 trunk/tools/perf/util/map.c | 4 -
 trunk/tools/perf/util/map.h | 19 -
 trunk/tools/perf/util/parse-events.c | 30 +-
 trunk/tools/perf/util/parse-events.h | 1 +
 trunk/tools/perf/util/probe-finder.h | 1 +
 .../util/scripting-engines/trace-event-perl.c | 75 +-
 .../scripting-engines/trace-event-python.c | 4 +-
 trunk/tools/perf/util/session.c | 342 ++-
 trunk/tools/perf/util/session.h | 72 +-
 trunk/tools/perf/util/setup.py | 3 +-
 trunk/tools/perf/util/symbol.c | 11 +-
 trunk/tools/perf/util/symbol.h | 1 -
 trunk/tools/perf/util/thread.c | 6 +-
 trunk/tools/perf/util/thread.h | 14 +-
 trunk/tools/perf/util/tool.h | 50 -
 trunk/tools/perf/util/top.h | 20 +-
 trunk/tools/perf/util/trace-event-info.c | 28 +-
 trunk/tools/perf/util/trace-event-scripting.c | 2 +-
 trunk/tools/perf/util/trace-event.h | 8 +-
 trunk/tools/perf/util/ui/browsers/annotate.c | 16 +-
 trunk/tools/perf/util/ui/browsers/hists.c | 2 +-
 trunk/tools/perf/util/ui/progress.c | 3 -
 trunk/tools/perf/util/usage.c | 5 +-
 trunk/tools/perf/util/util.h | 11 -
 trunk/tools/perf/util/values.c | 1 -
 trunk/virt/kvm/assigned-dev.c | 93 +-
 528 files changed, 9204 insertions(+), 13638 deletions(-)
 create mode 100644 trunk/arch/microblaze/include/asm/memblock.h
 create mode 100644 trunk/arch/openrisc/include/asm/memblock.h
 create mode 100644 trunk/arch/powerpc/include/asm/memblock.h
 delete mode 100644 trunk/arch/s390/oprofile/op_counter.h
 create mode 100644 trunk/arch/sh/include/asm/memblock.h
 create mode 100644 trunk/arch/sparc/include/asm/memblock.h
 delete mode 100644 trunk/arch/x86/include/asm/apic_flat_64.h
 create mode 100644 trunk/arch/x86/include/asm/memblock.h
 delete mode 100644 trunk/arch/x86/include/asm/numachip/numachip_csr.h
 delete mode 100644 trunk/arch/x86/kernel/apic/apic_numachip.c
 create mode 100644 trunk/arch/x86/mm/memblock.c
 create mode 100644 trunk/arch/x86/oprofile/nmi_timer_int.c
 delete mode 100644 trunk/arch/x86/tools/insn_sanity.c
 delete mode 100644 trunk/drivers/oprofile/nmi_timer_int.c
 delete mode 100644 trunk/kernel/events/callchain.c
 rename trunk/kernel/{sched/core.c => sched.c} (79%)
 delete mode 100644 trunk/kernel/sched/Makefile
 delete mode 100644 trunk/kernel/sched/sched.h
 delete mode 100644 trunk/kernel/sched/stats.c
 rename trunk/kernel/{sched/auto_group.c => sched_autogroup.c} (88%)
 rename trunk/kernel/{sched/auto_group.h => sched_autogroup.h} (66%)
 rename trunk/kernel/{sched/clock.c => sched_clock.c} (100%)
 rename trunk/kernel/{sched/cpupri.c => sched_cpupri.c} (99%)
 rename trunk/kernel/{sched/cpupri.h => sched_cpupri.h} (100%)
 rename trunk/kernel/{sched/debug.c => sched_debug.c} (99%)
 rename trunk/kernel/{sched/fair.c => sched_fair.c} (86%)
 rename trunk/kernel/{sched/features.h => sched_features.h} (75%)
 rename trunk/kernel/{sched/idle_task.c => sched_idletask.c} (96%)
 rename trunk/kernel/{sched/rt.c => sched_rt.c} (90%)
 rename trunk/kernel/{sched/stats.h => sched_stats.h} (70%)
 rename trunk/kernel/{sched/stop_task.c => sched_stoptask.c} (97%)
 delete mode 100644 trunk/tools/perf/util/tool.h

diff --git a/[refs] b/[refs]
index 98dbe08220b5..c5e849afafbb 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: f7e6746ebae984ea67b0a1a1e23c7e6698240631
+refs/heads/master: 8d973b624ece3b85cfae9474935795d034f72faf
diff --git a/trunk/Documentation/DocBook/debugobjects.tmpl b/trunk/Documentation/DocBook/debugobjects.tmpl
index 24979f691e3e..08ff908aa7a2 100644
--- a/trunk/Documentation/DocBook/debugobjects.tmpl
+++ b/trunk/Documentation/DocBook/debugobjects.tmpl
@@ -96,7 +96,6 @@ debug_object_deactivate
 debug_object_destroy
 debug_object_free
- debug_object_assert_init
 Each of these functions takes the address of the real object
 and a pointer to the object type specific debug description
@@ -274,26 +273,6 @@ debug checks.
-
-
- debug_object_assert_init
-
- This function is called to assert that an object has been
- initialized.
-
- When the real object is not tracked by debugobjects, it calls
- fixup_assert_init of the object type description structure
- provided by the caller, with the hardcoded object state
- ODEBUG_NOT_AVAILABLE. The fixup function can correct the problem
- by calling debug_object_init and other specific initializing
- functions.
-
- When the real object is already tracked by debugobjects it is
- ignored.
-
-
 Fixup functions
@@ -402,35 +381,6 @@ statistics.
-
- fixup_assert_init
-
- This function is called from the debug code whenever a problem
 in debug_object_assert_init is detected.
- - - Called from debug_object_assert_init() with a hardcoded state - ODEBUG_STATE_NOTAVAILABLE when the object is not found in the - debug bucket. - - - The function returns 1 when the fixup was successful, - otherwise 0. The return value is used to update the - statistics. - - - Note, this function should make sure debug_object_init() is - called before returning. - - - The handling of statically initialized objects is a special - case. The fixup function should check if this is a legitimate - case of a statically initialized object or not. In this case only - debug_object_init() should be called to make the object known to - the tracker. Then the function should return 0 because this is not - a real fixup. - - Known Bugs And Assumptions diff --git a/trunk/Documentation/RCU/checklist.txt b/trunk/Documentation/RCU/checklist.txt index bff2d8be1e18..0c134f8afc6f 100644 --- a/trunk/Documentation/RCU/checklist.txt +++ b/trunk/Documentation/RCU/checklist.txt @@ -328,12 +328,6 @@ over a rather long period of time, but improvements are always welcome! RCU rather than SRCU, because RCU is almost always faster and easier to use than is SRCU. - If you need to enter your read-side critical section in a - hardirq or exception handler, and then exit that same read-side - critical section in the task that was interrupted, then you need - to srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid - the lockdep checking that would otherwise this practice illegal. - Also unlike other forms of RCU, explicit initialization and cleanup is required via init_srcu_struct() and cleanup_srcu_struct(). These are passed a "struct srcu_struct" diff --git a/trunk/Documentation/RCU/rcu.txt b/trunk/Documentation/RCU/rcu.txt index bf778332a28f..31852705b586 100644 --- a/trunk/Documentation/RCU/rcu.txt +++ b/trunk/Documentation/RCU/rcu.txt @@ -38,11 +38,11 @@ o How can the updater tell when a grace period has completed Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the same effect, but require that the readers manipulate CPU-local - counters. These counters allow limited types of blocking within - RCU read-side critical sections. SRCU also uses CPU-local - counters, and permits general blocking within RCU read-side - critical sections. These variants of RCU detect grace periods - by sampling these counters. + counters. These counters allow limited types of blocking + within RCU read-side critical sections. SRCU also uses + CPU-local counters, and permits general blocking within + RCU read-side critical sections. These two variants of + RCU detect grace periods by sampling these counters. o If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? diff --git a/trunk/Documentation/RCU/stallwarn.txt b/trunk/Documentation/RCU/stallwarn.txt index 083d88cbc089..4e959208f736 100644 --- a/trunk/Documentation/RCU/stallwarn.txt +++ b/trunk/Documentation/RCU/stallwarn.txt @@ -101,11 +101,6 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning messages. -o A hardware or software issue shuts off the scheduler-clock - interrupt on a CPU that is not in dyntick-idle mode. This - problem really has happened, and seems to be most likely to - result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. - o A bug in the RCU implementation. o A hardware failure. This is quite unlikely, but has occurred @@ -114,11 +109,12 @@ o A hardware failure. 
This is quite unlikely, but has occurred This resulted in a series of RCU CPU stall warnings, eventually leading the realization that the CPU had failed. -The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning. -SRCU does not have its own CPU stall warnings, but its calls to -synchronize_sched() will result in RCU-sched detecting RCU-sched-related -CPU stalls. Please note that RCU only detects CPU stalls when there is -a grace period in progress. No grace period, no CPU stall warnings. +The RCU, RCU-sched, and RCU-bh implementations have CPU stall +warning. SRCU does not have its own CPU stall warnings, but its +calls to synchronize_sched() will result in RCU-sched detecting +RCU-sched-related CPU stalls. Please note that RCU only detects +CPU stalls when there is a grace period in progress. No grace period, +no CPU stall warnings. To diagnose the cause of the stall, inspect the stack traces. The offending function will usually be near the top of the stack. diff --git a/trunk/Documentation/RCU/torture.txt b/trunk/Documentation/RCU/torture.txt index d67068d0d2b9..783d6c134d3f 100644 --- a/trunk/Documentation/RCU/torture.txt +++ b/trunk/Documentation/RCU/torture.txt @@ -61,24 +61,11 @@ nreaders This is the number of RCU reading threads supported. To properly exercise RCU implementations with preemptible read-side critical sections. -onoff_interval - The number of seconds between each attempt to execute a - randomly selected CPU-hotplug operation. Defaults to - zero, which disables CPU hotplugging. In HOTPLUG_CPU=n - kernels, rcutorture will silently refuse to do any - CPU-hotplug operations regardless of what value is - specified for onoff_interval. - shuffle_interval The number of seconds to keep the test threads affinitied to a particular subset of the CPUs, defaults to 3 seconds. Used in conjunction with test_no_idle_hz. -shutdown_secs The number of seconds to run the test before terminating - the test and powering off the system. The default is - zero, which disables test termination and system shutdown. - This capability is useful for automated testing. - stat_interval The number of seconds between output of torture statistics (via printk()). Regardless of the interval, statistics are printed when the module is unloaded. diff --git a/trunk/Documentation/RCU/trace.txt b/trunk/Documentation/RCU/trace.txt index 49587abfc2f7..aaf65f6c6cd7 100644 --- a/trunk/Documentation/RCU/trace.txt +++ b/trunk/Documentation/RCU/trace.txt @@ -105,10 +105,14 @@ o "dt" is the current value of the dyntick counter that is incremented or one greater than the interrupt-nesting depth otherwise. The number after the second "/" is the NMI nesting depth. + This field is displayed only for CONFIG_NO_HZ kernels. + o "df" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being in dynticks-idle state. + This field is displayed only for CONFIG_NO_HZ kernels. + o "of" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being offline. In a perfect world, this might never happen, but it diff --git a/trunk/Documentation/RCU/whatisRCU.txt b/trunk/Documentation/RCU/whatisRCU.txt index 6bbe8dcdc3da..6ef692667e2f 100644 --- a/trunk/Documentation/RCU/whatisRCU.txt +++ b/trunk/Documentation/RCU/whatisRCU.txt @@ -4,7 +4,6 @@ to start learning about RCU: 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 3. 
RCU part 3: the RCU API http://lwn.net/Articles/264090/ -4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ What is RCU? @@ -835,8 +834,6 @@ SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu N/A srcu_read_unlock synchronize_srcu_expedited - srcu_read_lock_raw - srcu_read_unlock_raw srcu_dereference SRCU: Initialization/cleanup @@ -858,33 +855,27 @@ list can be helpful: a. Will readers need to block? If so, you need SRCU. -b. Is it necessary to start a read-side critical section in a - hardirq handler or exception handler, and then to complete - this read-side critical section in the task that was - interrupted? If so, you need SRCU's srcu_read_lock_raw() and - srcu_read_unlock_raw() primitives. - -c. What about the -rt patchset? If readers would need to block +b. What about the -rt patchset? If readers would need to block in an non-rt kernel, you need SRCU. If readers would block in a -rt kernel, but not in a non-rt kernel, SRCU is not necessary. -d. Do you need to treat NMI handlers, hardirq handlers, +c. Do you need to treat NMI handlers, hardirq handlers, and code segments with preemption disabled (whether via preempt_disable(), local_irq_save(), local_bh_disable(), or some other mechanism) as if they were explicit RCU readers? If so, you need RCU-sched. -e. Do you need RCU grace periods to complete even in the face +d. Do you need RCU grace periods to complete even in the face of softirq monopolization of one or more of the CPUs? For example, is your code subject to network-based denial-of-service attacks? If so, you need RCU-bh. -f. Is your workload too update-intensive for normal use of +e. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? If so, consider SLAB_DESTROY_BY_RCU. But please be careful! -g. Otherwise, use RCU. +f. Otherwise, use RCU. Of course, this all assumes that you have determined that RCU is in fact the right tool for your job. diff --git a/trunk/Documentation/atomic_ops.txt b/trunk/Documentation/atomic_ops.txt index 27f2b21a9d5c..3bd585b44927 100644 --- a/trunk/Documentation/atomic_ops.txt +++ b/trunk/Documentation/atomic_ops.txt @@ -84,93 +84,6 @@ compiler optimizes the section accessing atomic_t variables. *** YOU HAVE BEEN WARNED! *** -Properly aligned pointers, longs, ints, and chars (and unsigned -equivalents) may be atomically loaded from and stored to in the same -sense as described for atomic_read() and atomic_set(). The ACCESS_ONCE() -macro should be used to prevent the compiler from using optimizations -that might otherwise optimize accesses out of existence on the one hand, -or that might create unsolicited accesses on the other. - -For example consider the following code: - - while (a > 0) - do_something(); - -If the compiler can prove that do_something() does not store to the -variable a, then the compiler is within its rights transforming this to -the following: - - tmp = a; - if (a > 0) - for (;;) - do_something(); - -If you don't want the compiler to do this (and you probably don't), then -you should use something like the following: - - while (ACCESS_ONCE(a) < 0) - do_something(); - -Alternatively, you could place a barrier() call in the loop. 
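[Editor's note: the ACCESS_ONCE() macro that the removed text above relies on was, in kernels of this era, defined in <linux/compiler.h> as a volatile cast. A minimal sketch of that definition and the recommended loop, assuming GCC's typeof extension and a hypothetical do_something():

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	extern void do_something(void);
	static int a;

	static void wait_loop(void)
	{
		/* the volatile access forces a fresh load of 'a' on every
		 * iteration, so the compiler cannot hoist the test out of
		 * the loop or manufacture extra loads */
		while (ACCESS_ONCE(a) > 0)
			do_something();
	}

]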
- -For another example, consider the following code: - - tmp_a = a; - do_something_with(tmp_a); - do_something_else_with(tmp_a); - -If the compiler can prove that do_something_with() does not store to the -variable a, then the compiler is within its rights to manufacture an -additional load as follows: - - tmp_a = a; - do_something_with(tmp_a); - tmp_a = a; - do_something_else_with(tmp_a); - -This could fatally confuse your code if it expected the same value -to be passed to do_something_with() and do_something_else_with(). - -The compiler would be likely to manufacture this additional load if -do_something_with() was an inline function that made very heavy use -of registers: reloading from variable a could save a flush to the -stack and later reload. To prevent the compiler from attacking your -code in this manner, write the following: - - tmp_a = ACCESS_ONCE(a); - do_something_with(tmp_a); - do_something_else_with(tmp_a); - -For a final example, consider the following code, assuming that the -variable a is set at boot time before the second CPU is brought online -and never changed later, so that memory barriers are not needed: - - if (a) - b = 9; - else - b = 42; - -The compiler is within its rights to manufacture an additional store -by transforming the above code into the following: - - b = 42; - if (a) - b = 9; - -This could come as a fatal surprise to other code running concurrently -that expected b to never have the value 42 if a was zero. To prevent -the compiler from doing this, write something like: - - if (a) - ACCESS_ONCE(b) = 9; - else - ACCESS_ONCE(b) = 42; - -Don't even -think- about doing this without proper use of memory barriers, -locks, or atomic operations if variable a can change at runtime! - -*** WARNING: ACCESS_ONCE() DOES NOT IMPLY A BARRIER! *** - Now, we move onto the atomic operation interfaces typically implemented with the help of assembly code. diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index e229769606f2..81c287fad79d 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -1885,11 +1885,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. arch_perfmon: [X86] Force use of architectural perfmon on Intel CPUs instead of the CPU specific event set. - timer: [X86] Force use of architectural NMI - timer mode (see also oprofile.timer - for generic hr timer mode) - [s390] Force legacy basic mode sampling - (report cpu_type "timer") oops=panic Always panic on oopses. Default is to just kill the process, but there is a small probability of @@ -2755,10 +2750,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. functions are at fixed addresses, they make nice targets for exploits that can control RIP. - emulate [default] Vsyscalls turn into traps and are - emulated reasonably safely. + emulate Vsyscalls turn into traps and are emulated + reasonably safely. - native Vsyscalls are native syscall instructions. + native [default] Vsyscalls are native syscall + instructions. This is a little bit faster than trapping and makes a few dynamic recompilers work better than they would in emulation mode. 
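[Editor's note: for context on the emulate/native distinction above, the legacy vsyscalls live at fixed addresses, which is what makes them targets "for exploits that can control RIP". A minimal user-space sketch, assuming x86-64 and the historical fixed gettimeofday vsyscall address:

	#include <sys/time.h>

	typedef int (*vgtod_t)(struct timeval *, struct timezone *);

	int read_time(struct timeval *tv)
	{
		/* fixed-address gettimeofday vsyscall: executed directly
		 * under vsyscall=native, trapped and emulated by the
		 * kernel under vsyscall=emulate */
		vgtod_t vgtod = (vgtod_t)0xffffffffff600000UL;

		return vgtod(tv, NULL);
	}

]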
diff --git a/trunk/Documentation/lockdep-design.txt b/trunk/Documentation/lockdep-design.txt index 5dbc99c04f6e..abf768c681e2 100644 --- a/trunk/Documentation/lockdep-design.txt +++ b/trunk/Documentation/lockdep-design.txt @@ -221,66 +221,3 @@ when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we dont have to validate the chain again. - -Troubleshooting: ----------------- - -The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. -Exceeding this number will trigger the following lockdep warning: - - (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - -By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical -desktop systems have less than 1,000 lock classes, so this warning -normally results from lock-class leakage or failure to properly -initialize locks. These two problems are illustrated below: - -1. Repeated module loading and unloading while running the validator - will result in lock-class leakage. The issue here is that each - load of the module will create a new set of lock classes for - that module's locks, but module unloading does not remove old - classes (see below discussion of reuse of lock classes for why). - Therefore, if that module is loaded and unloaded repeatedly, - the number of lock classes will eventually reach the maximum. - -2. Using structures such as arrays that have large numbers of - locks that are not explicitly initialized. For example, - a hash table with 8192 buckets where each bucket has its own - spinlock_t will consume 8192 lock classes -unless- each spinlock - is explicitly initialized at runtime, for example, using the - run-time spin_lock_init() as opposed to compile-time initializers - such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize - the per-bucket spinlocks would guarantee lock-class overflow. - In contrast, a loop that called spin_lock_init() on each lock - would place all 8192 locks into a single lock class. - - The moral of this story is that you should always explicitly - initialize your locks. - -One might argue that the validator should be modified to allow -lock classes to be reused. However, if you are tempted to make this -argument, first review the code and think through the changes that would -be required, keeping in mind that the lock classes to be removed are -likely to be linked into the lock-dependency graph. This turns out to -be harder to do than to say. - -Of course, if you do run out of lock classes, the next thing to do is -to find the offending lock classes. First, the following command gives -you the number of lock classes currently in use along with the maximum: - - grep "lock-classes" /proc/lockdep_stats - -This command produces the following output on a modest system: - - lock-classes: 748 [max: 8191] - -If the number allocated (748 above) increases continually over time, -then there is likely a leak. The following command can be used to -identify the leaking lock classes: - - grep "BD" /proc/lockdep - -Run the command and save the output, then compare against the output from -a later run of this command to identify the leakers. This same output -can also help you find situations where runtime lock initialization has -been omitted. 
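[Editor's note: a minimal sketch of the initialization pattern the removed troubleshooting text recommends, using a hypothetical hash table. Because spin_lock_init() registers one lock class per call site, the loop keeps all 8192 bucket locks in a single class instead of overflowing MAX_LOCKDEP_KEYS:

	#include <linux/spinlock.h>

	#define NR_BUCKETS 8192

	static struct bucket {
		spinlock_t lock;
	} table[NR_BUCKETS];

	static void table_init(void)
	{
		int i;

		/* one spin_lock_init() call site => one lock class
		 * shared by all NR_BUCKETS locks */
		for (i = 0; i < NR_BUCKETS; i++)
			spin_lock_init(&table[i].lock);
	}

]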
diff --git a/trunk/Documentation/trace/events.txt b/trunk/Documentation/trace/events.txt index bb24c2a0e870..b510564aac7e 100644 --- a/trunk/Documentation/trace/events.txt +++ b/trunk/Documentation/trace/events.txt @@ -191,6 +191,8 @@ And for string fields they are: Currently, only exact string matches are supported. +Currently, the maximum number of predicates in a filter is 16. + 5.2 Setting filters ------------------- diff --git a/trunk/Documentation/virtual/kvm/api.txt b/trunk/Documentation/virtual/kvm/api.txt index e2a4b5287361..7945b0bd35e2 100644 --- a/trunk/Documentation/virtual/kvm/api.txt +++ b/trunk/Documentation/virtual/kvm/api.txt @@ -1100,15 +1100,6 @@ emulate them efficiently. The fields in each entry are defined as follows: eax, ebx, ecx, edx: the values returned by the cpuid instruction for this function/index combination -The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned -as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC -support. Instead it is reported via - - ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) - -if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the -feature in userspace, then you can enable the feature for KVM_SET_CPUID2. - 4.47 KVM_PPC_GET_PVINFO Capability: KVM_CAP_PPC_GET_PVINFO @@ -1160,13 +1151,6 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure -isolation of the device. Usages not specifying this flag are deprecated. - -Only PCI header type 0 devices with PCI BAR resources are supported by -device assignment. The user requesting this ioctl must have read/write -access to the PCI sysfs resource files associated with the device. 
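[Editor's note: a minimal user-space sketch of the assignment call the removed text documents, assuming a vm_fd obtained from KVM_CREATE_VM and the struct kvm_assigned_pci_dev layout of this era; the device id is a caller-chosen handle:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	int assign_pci_device(int vm_fd, __u32 bus, __u32 devfn)
	{
		struct kvm_assigned_pci_dev dev = {
			.assigned_dev_id = 1,	/* caller-chosen handle */
			.busnr           = bus,
			.devfn           = devfn,
			/* the flag the removed text calls mandatory */
			.flags           = KVM_DEV_ASSIGN_ENABLE_IOMMU,
		};

		return ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
	}

]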
- 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 62f1cd357ddf..6afba60c3904 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -1698,9 +1698,11 @@ F: arch/x86/include/asm/tce.h CAN NETWORK LAYER M: Oliver Hartkopp +M: Oliver Hartkopp +M: Urs Thuermann L: linux-can@vger.kernel.org -W: http://gitorious.org/linux-can -T: git git://gitorious.org/linux-can/linux-can-next.git +L: netdev@vger.kernel.org +W: http://developer.berlios.de/projects/socketcan/ S: Maintained F: net/can/ F: include/linux/can.h @@ -1711,10 +1713,9 @@ F: include/linux/can/gw.h CAN NETWORK DRIVERS M: Wolfgang Grandegger -M: Marc Kleine-Budde L: linux-can@vger.kernel.org -W: http://gitorious.org/linux-can -T: git git://gitorious.org/linux-can/linux-can-next.git +L: netdev@vger.kernel.org +W: http://developer.berlios.de/projects/socketcan/ S: Maintained F: drivers/net/can/ F: include/linux/can/dev.h @@ -2699,7 +2700,7 @@ FIREWIRE SUBSYSTEM M: Stefan Richter L: linux1394-devel@lists.sourceforge.net W: http://ieee1394.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394-2.6.git S: Maintained F: drivers/firewire/ F: include/linux/firewire*.h diff --git a/trunk/Makefile b/trunk/Makefile index adddd11c3b3b..a43733df3978 100644 --- a/trunk/Makefile +++ b/trunk/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc6 NAME = Saber-toothed Squirrel # *DOCUMENTATION* diff --git a/trunk/arch/Kconfig b/trunk/arch/Kconfig index 2505740b81d2..4b0669cbb3b0 100644 --- a/trunk/arch/Kconfig +++ b/trunk/arch/Kconfig @@ -30,10 +30,6 @@ config OPROFILE_EVENT_MULTIPLEX config HAVE_OPROFILE bool -config OPROFILE_NMI_TIMER - def_bool y - depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI - config KPROBES bool "Kprobes" depends on MODULES diff --git a/trunk/arch/arm/Kconfig b/trunk/arch/arm/Kconfig index b259c7c644e3..776d76b8cb69 100644 --- a/trunk/arch/arm/Kconfig +++ b/trunk/arch/arm/Kconfig @@ -1246,7 +1246,7 @@ config PL310_ERRATA_588369 config ARM_ERRATA_720789 bool "ARM errata: TLBIASIDIS and TLBIMVAIS operations can broadcast a faulty ASID" - depends on CPU_V7 + depends on CPU_V7 && SMP help This option enables the workaround for the 720789 Cortex-A9 (prior to r2p0) erratum. A faulty ASID can be sent to the other CPUs for the @@ -1282,7 +1282,7 @@ config ARM_ERRATA_743622 config ARM_ERRATA_751472 bool "ARM errata: Interrupted ICIALLUIS may prevent completion of broadcasted operation" - depends on CPU_V7 + depends on CPU_V7 && SMP help This option enables the workaround for the 751472 Cortex-A9 (prior to r3p0) erratum. An interrupted ICIALLUIS operation may prevent the diff --git a/trunk/arch/arm/common/pl330.c b/trunk/arch/arm/common/pl330.c index 8d8df744f7a5..f407a6b35d3d 100644 --- a/trunk/arch/arm/common/pl330.c +++ b/trunk/arch/arm/common/pl330.c @@ -221,6 +221,17 @@ */ #define MCODE_BUFF_PER_REQ 256 +/* + * Mark a _pl330_req as free. + * We do it by writing DMAEND as the first instruction + * because no valid request is going to have DMAEND as + * its first instruction to execute. 
+ */ +#define MARK_FREE(req) do { \ + _emit_END(0, (req)->mc_cpu); \ + (req)->mc_len = 0; \ + } while (0) + /* If the _pl330_req is available to the client */ #define IS_FREE(req) (*((u8 *)((req)->mc_cpu)) == CMD_DMAEND) @@ -290,10 +301,8 @@ struct pl330_thread { struct pl330_dmac *dmac; /* Only two at a time */ struct _pl330_req req[2]; - /* Index of the last enqueued request */ + /* Index of the last submitted request */ unsigned lstenq; - /* Index of the last submitted request or -1 if the DMA is stopped */ - int req_running; }; enum pl330_dmac_state { @@ -769,22 +778,6 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd, writel(0, regs + DBGCMD); } -/* - * Mark a _pl330_req as free. - * We do it by writing DMAEND as the first instruction - * because no valid request is going to have DMAEND as - * its first instruction to execute. - */ -static void mark_free(struct pl330_thread *thrd, int idx) -{ - struct _pl330_req *req = &thrd->req[idx]; - - _emit_END(0, req->mc_cpu); - req->mc_len = 0; - - thrd->req_running = -1; -} - static inline u32 _state(struct pl330_thread *thrd) { void __iomem *regs = thrd->dmac->pinfo->base; @@ -843,6 +836,31 @@ static inline u32 _state(struct pl330_thread *thrd) } } +/* If the request 'req' of thread 'thrd' is currently active */ +static inline bool _req_active(struct pl330_thread *thrd, + struct _pl330_req *req) +{ + void __iomem *regs = thrd->dmac->pinfo->base; + u32 buf = req->mc_bus, pc = readl(regs + CPC(thrd->id)); + + if (IS_FREE(req)) + return false; + + return (pc >= buf && pc <= buf + req->mc_len) ? true : false; +} + +/* Returns 0 if the thread is inactive, ID of active req + 1 otherwise */ +static inline unsigned _thrd_active(struct pl330_thread *thrd) +{ + if (_req_active(thrd, &thrd->req[0])) + return 1; /* First req active */ + + if (_req_active(thrd, &thrd->req[1])) + return 2; /* Second req active */ + + return 0; +} + static void _stop(struct pl330_thread *thrd) { void __iomem *regs = thrd->dmac->pinfo->base; @@ -874,22 +892,17 @@ static bool _trigger(struct pl330_thread *thrd) struct _arg_GO go; unsigned ns; u8 insn[6] = {0, 0, 0, 0, 0, 0}; - int idx; /* Return if already ACTIVE */ if (_state(thrd) != PL330_STATE_STOPPED) return true; - idx = 1 - thrd->lstenq; - if (!IS_FREE(&thrd->req[idx])) - req = &thrd->req[idx]; - else { - idx = thrd->lstenq; - if (!IS_FREE(&thrd->req[idx])) - req = &thrd->req[idx]; - else - req = NULL; - } + if (!IS_FREE(&thrd->req[1 - thrd->lstenq])) + req = &thrd->req[1 - thrd->lstenq]; + else if (!IS_FREE(&thrd->req[thrd->lstenq])) + req = &thrd->req[thrd->lstenq]; + else + req = NULL; /* Return if no request */ if (!req || !req->r) @@ -920,8 +933,6 @@ static bool _trigger(struct pl330_thread *thrd) /* Only manager can execute GO */ _execute_DBGINSN(thrd, insn, true); - thrd->req_running = idx; - return true; } @@ -1371,8 +1382,8 @@ static void pl330_dotask(unsigned long data) thrd->req[0].r = NULL; thrd->req[1].r = NULL; - mark_free(thrd, 0); - mark_free(thrd, 1); + MARK_FREE(&thrd->req[0]); + MARK_FREE(&thrd->req[1]); /* Clear the reset flag */ pl330->dmac_tbd.reset_chan &= ~(1 << i); @@ -1450,12 +1461,14 @@ int pl330_update(const struct pl330_info *pi) thrd = &pl330->channels[id]; - active = thrd->req_running; - if (active == -1) /* Aborted */ + active = _thrd_active(thrd); + if (!active) /* Aborted */ continue; + active -= 1; + rqdone = &thrd->req[active]; - mark_free(thrd, active); + MARK_FREE(rqdone); /* Get going again ASAP */ _start(thrd); @@ -1496,7 +1509,7 @@ int pl330_chan_ctrl(void *ch_id, 
enum pl330_chan_op op) struct pl330_thread *thrd = ch_id; struct pl330_dmac *pl330; unsigned long flags; - int ret = 0, active = thrd->req_running; + int ret = 0, active; if (!thrd || thrd->free || thrd->dmac->state == DYING) return -EINVAL; @@ -1512,24 +1525,28 @@ int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op) thrd->req[0].r = NULL; thrd->req[1].r = NULL; - mark_free(thrd, 0); - mark_free(thrd, 1); + MARK_FREE(&thrd->req[0]); + MARK_FREE(&thrd->req[1]); break; case PL330_OP_ABORT: + active = _thrd_active(thrd); + /* Make sure the channel is stopped */ _stop(thrd); /* ABORT is only for the active req */ - if (active == -1) + if (!active) break; + active--; + thrd->req[active].r = NULL; - mark_free(thrd, active); + MARK_FREE(&thrd->req[active]); /* Start the next */ case PL330_OP_START: - if ((active == -1) && !_start(thrd)) + if (!_thrd_active(thrd) && !_start(thrd)) ret = -EIO; break; @@ -1570,13 +1587,14 @@ int pl330_chan_status(void *ch_id, struct pl330_chanstatus *pstatus) else pstatus->faulting = false; - active = thrd->req_running; + active = _thrd_active(thrd); - if (active == -1) { + if (!active) { /* Indicate that the thread is not running */ pstatus->top_req = NULL; pstatus->wait_req = NULL; } else { + active--; pstatus->top_req = thrd->req[active].r; pstatus->wait_req = !IS_FREE(&thrd->req[1 - active]) ? thrd->req[1 - active].r : NULL; @@ -1641,9 +1659,9 @@ void *pl330_request_channel(const struct pl330_info *pi) thrd->free = false; thrd->lstenq = 1; thrd->req[0].r = NULL; - mark_free(thrd, 0); + MARK_FREE(&thrd->req[0]); thrd->req[1].r = NULL; - mark_free(thrd, 1); + MARK_FREE(&thrd->req[1]); break; } } @@ -1749,14 +1767,14 @@ static inline void _reset_thread(struct pl330_thread *thrd) thrd->req[0].mc_bus = pl330->mcode_bus + (thrd->id * pi->mcbufsz); thrd->req[0].r = NULL; - mark_free(thrd, 0); + MARK_FREE(&thrd->req[0]); thrd->req[1].mc_cpu = thrd->req[0].mc_cpu + pi->mcbufsz / 2; thrd->req[1].mc_bus = thrd->req[0].mc_bus + pi->mcbufsz / 2; thrd->req[1].r = NULL; - mark_free(thrd, 1); + MARK_FREE(&thrd->req[1]); } static int dmac_alloc_threads(struct pl330_dmac *pl330) diff --git a/trunk/arch/arm/configs/imx_v4_v5_defconfig b/trunk/arch/arm/configs/imx_v4_v5_defconfig index cf497ce41dfe..11a4192197c8 100644 --- a/trunk/arch/arm/configs/imx_v4_v5_defconfig +++ b/trunk/arch/arm/configs/imx_v4_v5_defconfig @@ -18,10 +18,9 @@ CONFIG_ARCH_MXC=y CONFIG_ARCH_IMX_V4_V5=y CONFIG_ARCH_MX1ADS=y CONFIG_MACH_SCB9328=y -CONFIG_MACH_APF9328=y CONFIG_MACH_MX21ADS=y CONFIG_MACH_MX25_3DS=y -CONFIG_MACH_EUKREA_CPUIMX25SD=y +CONFIG_MACH_EUKREA_CPUIMX25=y CONFIG_MACH_MX27ADS=y CONFIG_MACH_PCM038=y CONFIG_MACH_CPUIMX27=y @@ -73,16 +72,17 @@ CONFIG_MTD_CFI_GEOMETRY=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_NAND=y -CONFIG_MTD_NAND_MXC=y CONFIG_MTD_UBI=y CONFIG_MISC_DEVICES=y CONFIG_EEPROM_AT24=y CONFIG_EEPROM_AT25=y CONFIG_NETDEVICES=y -CONFIG_DM9000=y +CONFIG_NET_ETHERNET=y CONFIG_SMC91X=y +CONFIG_DM9000=y CONFIG_SMC911X=y -CONFIG_SMSC_PHY=y +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set # CONFIG_INPUT_MOUSEDEV is not set CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_KEYBOARD is not set @@ -100,7 +100,6 @@ CONFIG_I2C_CHARDEV=y CONFIG_I2C_IMX=y CONFIG_SPI=y CONFIG_SPI_IMX=y -CONFIG_SPI_SPIDEV=y CONFIG_W1=y CONFIG_W1_MASTER_MXC=y CONFIG_W1_SLAVE_THERM=y @@ -140,7 +139,6 @@ CONFIG_MMC=y CONFIG_MMC_MXC=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y -CONFIG_LEDS_GPIO=y CONFIG_LEDS_MC13783=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y diff --git 
a/trunk/arch/arm/kernel/process.c b/trunk/arch/arm/kernel/process.c index e8e8fe505df1..3d0c6fb74ae4 100644 --- a/trunk/arch/arm/kernel/process.c +++ b/trunk/arch/arm/kernel/process.c @@ -183,8 +183,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -214,8 +213,7 @@ void cpu_idle(void) } } leds_event(led_idle_end); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/arm/kernel/setup.c b/trunk/arch/arm/kernel/setup.c index c0b59bff6be6..8fc2c8fcbdc6 100644 --- a/trunk/arch/arm/kernel/setup.c +++ b/trunk/arch/arm/kernel/setup.c @@ -52,7 +52,6 @@ #include #include #include -#include #if defined(CONFIG_DEPRECATED_PARAM_STRUCT) #include "compat.h" diff --git a/trunk/arch/arm/mach-exynos/cpu.c b/trunk/arch/arm/mach-exynos/cpu.c index cc8d4bd6d0f7..90ec247f3b37 100644 --- a/trunk/arch/arm/mach-exynos/cpu.c +++ b/trunk/arch/arm/mach-exynos/cpu.c @@ -110,6 +110,11 @@ static struct map_desc exynos4_iodesc[] __initdata = { .pfn = __phys_to_pfn(EXYNOS4_PA_DMC0), .length = SZ_4K, .type = MT_DEVICE, + }, { + .virtual = (unsigned long)S5P_VA_SROMC, + .pfn = __phys_to_pfn(EXYNOS4_PA_SROMC), + .length = SZ_4K, + .type = MT_DEVICE, }, { .virtual = (unsigned long)S3C_VA_USB_HSPHY, .pfn = __phys_to_pfn(EXYNOS4_PA_HSPHY), diff --git a/trunk/arch/arm/mach-imx/Kconfig b/trunk/arch/arm/mach-imx/Kconfig index 0e6f1af260b6..c44aa974e79c 100644 --- a/trunk/arch/arm/mach-imx/Kconfig +++ b/trunk/arch/arm/mach-imx/Kconfig @@ -132,7 +132,7 @@ config MACH_MX25_3DS select IMX_HAVE_PLATFORM_MXC_NAND select IMX_HAVE_PLATFORM_SDHCI_ESDHC_IMX -config MACH_EUKREA_CPUIMX25SD +config MACH_EUKREA_CPUIMX25 bool "Support Eukrea CPUIMX25 Platform" select SOC_IMX25 select IMX_HAVE_PLATFORM_FLEXCAN @@ -148,7 +148,7 @@ config MACH_EUKREA_CPUIMX25SD choice prompt "Baseboard" - depends on MACH_EUKREA_CPUIMX25SD + depends on MACH_EUKREA_CPUIMX25 default MACH_EUKREA_MBIMXSD25_BASEBOARD config MACH_EUKREA_MBIMXSD25_BASEBOARD @@ -542,7 +542,7 @@ config MACH_MX35_3DS Include support for MX35PDK platform. This includes specific configurations for the board and its peripherals. 
-config MACH_EUKREA_CPUIMX35SD +config MACH_EUKREA_CPUIMX35 bool "Support Eukrea CPUIMX35 Platform" select SOC_IMX35 select IMX_HAVE_PLATFORM_FLEXCAN @@ -560,7 +560,7 @@ config MACH_EUKREA_CPUIMX35SD choice prompt "Baseboard" - depends on MACH_EUKREA_CPUIMX35SD + depends on MACH_EUKREA_CPUIMX35 default MACH_EUKREA_MBIMXSD35_BASEBOARD config MACH_EUKREA_MBIMXSD35_BASEBOARD diff --git a/trunk/arch/arm/mach-imx/Makefile b/trunk/arch/arm/mach-imx/Makefile index d97f409ce98b..aba73214c2a8 100644 --- a/trunk/arch/arm/mach-imx/Makefile +++ b/trunk/arch/arm/mach-imx/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_MACH_MX21ADS) += mach-mx21ads.o # i.MX25 based machines obj-$(CONFIG_MACH_MX25_3DS) += mach-mx25_3ds.o -obj-$(CONFIG_MACH_EUKREA_CPUIMX25SD) += mach-eukrea_cpuimx25.o +obj-$(CONFIG_MACH_EUKREA_CPUIMX25) += mach-eukrea_cpuimx25.o obj-$(CONFIG_MACH_EUKREA_MBIMXSD25_BASEBOARD) += eukrea_mbimxsd25-baseboard.o # i.MX27 based machines @@ -57,7 +57,7 @@ obj-$(CONFIG_MACH_BUG) += mach-bug.o # i.MX35 based machines obj-$(CONFIG_MACH_PCM043) += mach-pcm043.o obj-$(CONFIG_MACH_MX35_3DS) += mach-mx35_3ds.o -obj-$(CONFIG_MACH_EUKREA_CPUIMX35SD) += mach-cpuimx35.o +obj-$(CONFIG_MACH_EUKREA_CPUIMX35) += mach-cpuimx35.o obj-$(CONFIG_MACH_EUKREA_MBIMXSD35_BASEBOARD) += eukrea_mbimxsd35-baseboard.o obj-$(CONFIG_MACH_VPR200) += mach-vpr200.o diff --git a/trunk/arch/arm/mach-imx/clock-imx35.c b/trunk/arch/arm/mach-imx/clock-imx35.c index ac8238caecb9..8116f119517d 100644 --- a/trunk/arch/arm/mach-imx/clock-imx35.c +++ b/trunk/arch/arm/mach-imx/clock-imx35.c @@ -507,7 +507,7 @@ static struct clk_lookup lookups[] = { int __init mx35_clocks_init() { - unsigned int cgr2 = 3 << 26; + unsigned int cgr2 = 3 << 26, cgr3 = 0; #if defined(CONFIG_DEBUG_LL) && !defined(CONFIG_DEBUG_ICEDCC) cgr2 |= 3 << 16; @@ -521,12 +521,6 @@ int __init mx35_clocks_init() __raw_writel((3 << 18), CCM_BASE + CCM_CGR0); __raw_writel((3 << 2) | (3 << 4) | (3 << 6) | (3 << 8) | (3 << 16), CCM_BASE + CCM_CGR1); - __raw_writel(cgr2, CCM_BASE + CCM_CGR2); - __raw_writel(0, CCM_BASE + CCM_CGR3); - - clk_enable(&iim_clk); - imx_print_silicon_rev("i.MX35", mx35_revision()); - clk_disable(&iim_clk); /* * Check if we came up in internal boot mode. 
If yes, we need some @@ -535,11 +529,17 @@ int __init mx35_clocks_init() */ if (!(__raw_readl(CCM_BASE + CCM_RCSR) & (3 << 10))) { /* Additionally turn on UART1, SCC, and IIM clocks */ - clk_enable(&iim_clk); - clk_enable(&uart1_clk); - clk_enable(&scc_clk); + cgr2 |= 3 << 16 | 3 << 4; + cgr3 |= 3 << 2; } + __raw_writel(cgr2, CCM_BASE + CCM_CGR2); + __raw_writel(cgr3, CCM_BASE + CCM_CGR3); + + clk_enable(&iim_clk); + imx_print_silicon_rev("i.MX35", mx35_revision()); + clk_disable(&iim_clk); + #ifdef CONFIG_MXC_USE_EPIT epit_timer_init(&epit1_clk, MX35_IO_ADDRESS(MX35_EPIT1_BASE_ADDR), MX35_INT_EPIT1); diff --git a/trunk/arch/arm/mach-imx/mach-cpuimx35.c b/trunk/arch/arm/mach-imx/mach-cpuimx35.c index 362aae780601..66af2e8f7e57 100644 --- a/trunk/arch/arm/mach-imx/mach-cpuimx35.c +++ b/trunk/arch/arm/mach-imx/mach-cpuimx35.c @@ -53,18 +53,12 @@ static const struct imxi2c_platform_data .bitrate = 100000, }; -#define TSC2007_IRQGPIO IMX_GPIO_NR(3, 2) -static int tsc2007_get_pendown_state(void) -{ - return !gpio_get_value(TSC2007_IRQGPIO); -} - static struct tsc2007_platform_data tsc2007_info = { .model = 2007, .x_plate_ohms = 180, - .get_pendown_state = tsc2007_get_pendown_state, }; +#define TSC2007_IRQGPIO IMX_GPIO_NR(3, 2) static struct i2c_board_info eukrea_cpuimx35_i2c_devices[] = { { I2C_BOARD_INFO("pcf8563", 0x51), diff --git a/trunk/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c b/trunk/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c index eef43e2e163e..7f8915ad5099 100644 --- a/trunk/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c +++ b/trunk/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c @@ -3247,14 +3247,18 @@ static __initdata struct omap_hwmod *omap3xxx_hwmods[] = { /* 3430ES1-only hwmods */ static __initdata struct omap_hwmod *omap3430es1_hwmods[] = { + &omap3xxx_iva_hwmod, &omap3430es1_dss_core_hwmod, + &omap3xxx_mailbox_hwmod, NULL }; /* 3430ES2+-only hwmods */ static __initdata struct omap_hwmod *omap3430es2plus_hwmods[] = { + &omap3xxx_iva_hwmod, &omap3xxx_dss_core_hwmod, &omap3xxx_usbhsotg_hwmod, + &omap3xxx_mailbox_hwmod, NULL }; diff --git a/trunk/arch/arm/mm/init.c b/trunk/arch/arm/mm/init.c index 7c38474e533a..fbdd12ea3a58 100644 --- a/trunk/arch/arm/mm/init.c +++ b/trunk/arch/arm/mm/init.c @@ -32,7 +32,6 @@ #include #include -#include #include "mm.h" @@ -333,6 +332,7 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc) sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); + memblock_init(); for (i = 0; i < mi->nr_banks; i++) memblock_add(mi->bank[i].start, mi->bank[i].size); @@ -371,7 +371,7 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc) if (mdesc->reserve) mdesc->reserve(); - memblock_allow_resize(); + memblock_analyze(); memblock_dump_all(); } diff --git a/trunk/arch/arm/mm/proc-v7.S b/trunk/arch/arm/mm/proc-v7.S index e70a73731eaa..2c559ac38142 100644 --- a/trunk/arch/arm/mm/proc-v7.S +++ b/trunk/arch/arm/mm/proc-v7.S @@ -363,13 +363,11 @@ __v7_setup: orreq r10, r10, #1 << 6 @ set bit #6 mcreq p15, 0, r10, c15, c0, 1 @ write diagnostic register #endif -#if defined(CONFIG_ARM_ERRATA_751472) && defined(CONFIG_SMP) - ALT_SMP(cmp r6, #0x30) @ present prior to r3p0 - ALT_UP_B(1f) +#ifdef CONFIG_ARM_ERRATA_751472 + cmp r6, #0x30 @ present prior to r3p0 mrclt p15, 0, r10, c15, c0, 1 @ read diagnostic register orrlt r10, r10, #1 << 11 @ set bit #11 mcrlt p15, 0, r10, c15, c0, 1 @ write diagnostic register -1: #endif 3: mov r10, #0 diff --git a/trunk/arch/arm/plat-mxc/cpufreq.c 
b/trunk/arch/arm/plat-mxc/cpufreq.c index 73db34bf588a..adbff706ef6f 100644 --- a/trunk/arch/arm/plat-mxc/cpufreq.c +++ b/trunk/arch/arm/plat-mxc/cpufreq.c @@ -98,7 +98,7 @@ static int mxc_set_target(struct cpufreq_policy *policy, return ret; } -static int mxc_cpufreq_init(struct cpufreq_policy *policy) +static int __init mxc_cpufreq_init(struct cpufreq_policy *policy) { int ret; int i; diff --git a/trunk/arch/arm/plat-mxc/include/mach/uncompress.h b/trunk/arch/arm/plat-mxc/include/mach/uncompress.h index 477971b00930..88fd40452567 100644 --- a/trunk/arch/arm/plat-mxc/include/mach/uncompress.h +++ b/trunk/arch/arm/plat-mxc/include/mach/uncompress.h @@ -98,7 +98,6 @@ static __inline__ void __arch_decomp_setup(unsigned long arch_id) case MACH_TYPE_PCM043: case MACH_TYPE_LILLY1131: case MACH_TYPE_VPR200: - case MACH_TYPE_EUKREA_CPUIMX35SD: uart_base = MX3X_UART1_BASE_ADDR; break; case MACH_TYPE_MAGX_ZN5: diff --git a/trunk/arch/arm/plat-mxc/pwm.c b/trunk/arch/arm/plat-mxc/pwm.c index e032717f7d02..845de59f07ed 100644 --- a/trunk/arch/arm/plat-mxc/pwm.c +++ b/trunk/arch/arm/plat-mxc/pwm.c @@ -77,15 +77,6 @@ int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns) do_div(c, period_ns); duty_cycles = c; - /* - * according to imx pwm RM, the real period value should be - * PERIOD value in PWMPR plus 2. - */ - if (period_cycles > 2) - period_cycles -= 2; - else - period_cycles = 0; - writel(duty_cycles, pwm->mmio_base + MX3_PWMSAR); writel(period_cycles, pwm->mmio_base + MX3_PWMPR); diff --git a/trunk/arch/arm/plat-orion/gpio.c b/trunk/arch/arm/plat-orion/gpio.c index 10d160888133..41ab97ebe4cf 100644 --- a/trunk/arch/arm/plat-orion/gpio.c +++ b/trunk/arch/arm/plat-orion/gpio.c @@ -384,16 +384,12 @@ void __init orion_gpio_init(int gpio_base, int ngpio, struct orion_gpio_chip *ochip; struct irq_chip_generic *gc; struct irq_chip_type *ct; - char gc_label[16]; if (orion_gpio_chip_count == ARRAY_SIZE(orion_gpio_chips)) return; - snprintf(gc_label, sizeof(gc_label), "orion_gpio%d", - orion_gpio_chip_count); - ochip = orion_gpio_chips + orion_gpio_chip_count; - ochip->chip.label = kstrdup(gc_label, GFP_KERNEL); + ochip->chip.label = "orion_gpio"; ochip->chip.request = orion_gpio_request; ochip->chip.direction_input = orion_gpio_direction_input; ochip->chip.get = orion_gpio_get; diff --git a/trunk/arch/arm/plat-samsung/include/plat/cpu-freq-core.h b/trunk/arch/arm/plat-samsung/include/plat/cpu-freq-core.h index 95509d8eb140..dac4760c0f0a 100644 --- a/trunk/arch/arm/plat-samsung/include/plat/cpu-freq-core.h +++ b/trunk/arch/arm/plat-samsung/include/plat/cpu-freq-core.h @@ -202,6 +202,14 @@ extern int s3c_plltab_register(struct cpufreq_frequency_table *plls, extern struct s3c_cpufreq_config *s3c_cpufreq_getconfig(void); extern struct s3c_iotimings *s3c_cpufreq_getiotimings(void); +extern void s3c2410_iotiming_debugfs(struct seq_file *seq, + struct s3c_cpufreq_config *cfg, + union s3c_iobank *iob); + +extern void s3c2412_iotiming_debugfs(struct seq_file *seq, + struct s3c_cpufreq_config *cfg, + union s3c_iobank *iob); + #ifdef CONFIG_CPU_FREQ_S3C24XX_DEBUGFS #define s3c_cpufreq_debugfs_call(x) x #else @@ -218,10 +226,6 @@ extern void s3c2410_cpufreq_setrefresh(struct s3c_cpufreq_config *cfg); extern void s3c2410_set_fvco(struct s3c_cpufreq_config *cfg); #ifdef CONFIG_S3C2410_IOTIMING -extern void s3c2410_iotiming_debugfs(struct seq_file *seq, - struct s3c_cpufreq_config *cfg, - union s3c_iobank *iob); - extern int s3c2410_iotiming_calc(struct s3c_cpufreq_config *cfg, struct s3c_iotimings *iot); @@ 
-231,7 +235,6 @@ extern int s3c2410_iotiming_get(struct s3c_cpufreq_config *cfg, extern void s3c2410_iotiming_set(struct s3c_cpufreq_config *cfg, struct s3c_iotimings *iot); #else -#define s3c2410_iotiming_debugfs NULL #define s3c2410_iotiming_calc NULL #define s3c2410_iotiming_get NULL #define s3c2410_iotiming_set NULL @@ -239,10 +242,8 @@ extern void s3c2410_iotiming_set(struct s3c_cpufreq_config *cfg, /* S3C2412 compatible routines */ -#ifdef CONFIG_S3C2412_IOTIMING -extern void s3c2412_iotiming_debugfs(struct seq_file *seq, - struct s3c_cpufreq_config *cfg, - union s3c_iobank *iob); +extern int s3c2412_iotiming_get(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *timings); extern int s3c2412_iotiming_get(struct s3c_cpufreq_config *cfg, struct s3c_iotimings *timings); @@ -252,12 +253,6 @@ extern int s3c2412_iotiming_calc(struct s3c_cpufreq_config *cfg, extern void s3c2412_iotiming_set(struct s3c_cpufreq_config *cfg, struct s3c_iotimings *iot); -#else -#define s3c2412_iotiming_debugfs NULL -#define s3c2412_iotiming_calc NULL -#define s3c2412_iotiming_get NULL -#define s3c2412_iotiming_set NULL -#endif /* CONFIG_S3C2412_IOTIMING */ #ifdef CONFIG_CPU_FREQ_S3C24XX_DEBUG #define s3c_freq_dbg(x...) printk(KERN_INFO x) diff --git a/trunk/arch/avr32/kernel/process.c b/trunk/arch/avr32/kernel/process.c index ea3395750324..ef5a2a08fcca 100644 --- a/trunk/arch/avr32/kernel/process.c +++ b/trunk/arch/avr32/kernel/process.c @@ -34,12 +34,10 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) cpu_idle_sleep(); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/blackfin/kernel/process.c b/trunk/arch/blackfin/kernel/process.c index 8dd0416673cb..6a80a9e9fc4a 100644 --- a/trunk/arch/blackfin/kernel/process.c +++ b/trunk/arch/blackfin/kernel/process.c @@ -88,12 +88,10 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) idle(); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/cris/arch-v32/kernel/time.c b/trunk/arch/cris/arch-v32/kernel/time.c index 6773fc83a670..bb978ede8985 100644 --- a/trunk/arch/cris/arch-v32/kernel/time.c +++ b/trunk/arch/cris/arch-v32/kernel/time.c @@ -47,12 +47,14 @@ static struct clocksource cont_rotime = { .rating = 300, .read = read_cont_rotime, .mask = CLOCKSOURCE_MASK(32), + .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init etrax_init_cont_rotime(void) { - clocksource_register_khz(&cont_rotime, 100000); + cont_rotime.mult = clocksource_khz2mult(100000, cont_rotime.shift); + clocksource_register(&cont_rotime); return 0; } arch_initcall(etrax_init_cont_rotime); diff --git a/trunk/arch/ia64/Kconfig b/trunk/arch/ia64/Kconfig index 3b7a7c483785..27489b6dd533 100644 --- a/trunk/arch/ia64/Kconfig +++ b/trunk/arch/ia64/Kconfig @@ -23,9 +23,6 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select IRQ_PER_CPU @@ -477,6 +474,9 @@ config NODES_SHIFT MAX_NUMNODES will be 2^(This value). 
If in doubt, use the default. +config ARCH_POPULATES_NODE_MAP + def_bool y + # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. # VIRTUAL_MEM_MAP has been retained for historical reasons. config VIRTUAL_MEM_MAP diff --git a/trunk/arch/ia64/include/asm/cputime.h b/trunk/arch/ia64/include/asm/cputime.h index 3deac956d325..6073b187528a 100644 --- a/trunk/arch/ia64/include/asm/cputime.h +++ b/trunk/arch/ia64/include/asm/cputime.h @@ -26,53 +26,59 @@ #include #include -typedef u64 __nocast cputime_t; -typedef u64 __nocast cputime64_t; +typedef u64 cputime_t; +typedef u64 cputime64_t; +#define cputime_zero ((cputime_t)0) #define cputime_one_jiffy jiffies_to_cputime(1) +#define cputime_max ((~((cputime_t)0) >> 1) - 1) +#define cputime_add(__a, __b) ((__a) + (__b)) +#define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, __n) ((__a) / (__n)) +#define cputime_halve(__a) ((__a) >> 1) +#define cputime_eq(__a, __b) ((__a) == (__b)) +#define cputime_gt(__a, __b) ((__a) > (__b)) +#define cputime_ge(__a, __b) ((__a) >= (__b)) +#define cputime_lt(__a, __b) ((__a) < (__b)) +#define cputime_le(__a, __b) ((__a) <= (__b)) + +#define cputime64_zero ((cputime64_t)0) +#define cputime64_add(__a, __b) ((__a) + (__b)) +#define cputime64_sub(__a, __b) ((__a) - (__b)) +#define cputime_to_cputime64(__ct) (__ct) /* * Convert cputime <-> jiffies (HZ) */ -#define cputime_to_jiffies(__ct) \ - ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) -#define jiffies_to_cputime(__jif) \ - (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) -#define cputime64_to_jiffies64(__ct) \ - ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) -#define jiffies64_to_cputime64(__jif) \ - (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) +#define cputime_to_jiffies(__ct) ((__ct) / (NSEC_PER_SEC / HZ)) +#define jiffies_to_cputime(__jif) ((__jif) * (NSEC_PER_SEC / HZ)) +#define cputime64_to_jiffies64(__ct) ((__ct) / (NSEC_PER_SEC / HZ)) +#define jiffies64_to_cputime64(__jif) ((__jif) * (NSEC_PER_SEC / HZ)) /* * Convert cputime <-> microseconds */ -#define cputime_to_usecs(__ct) \ - ((__force u64)(__ct) / NSEC_PER_USEC) -#define usecs_to_cputime(__usecs) \ - (__force cputime_t)((__usecs) * NSEC_PER_USEC) -#define usecs_to_cputime64(__usecs) \ - (__force cputime64_t)((__usecs) * NSEC_PER_USEC) +#define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) +#define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) /* * Convert cputime <-> seconds */ -#define cputime_to_secs(__ct) \ - ((__force u64)(__ct) / NSEC_PER_SEC) -#define secs_to_cputime(__secs) \ - (__force cputime_t)((__secs) * NSEC_PER_SEC) +#define cputime_to_secs(__ct) ((__ct) / NSEC_PER_SEC) +#define secs_to_cputime(__secs) ((__secs) * NSEC_PER_SEC) /* * Convert cputime <-> timespec (nsec) */ static inline cputime_t timespec_to_cputime(const struct timespec *val) { - u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec; - return (__force cputime_t) ret; + cputime_t ret = val->tv_sec * NSEC_PER_SEC; + return (ret + val->tv_nsec); } static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) { - val->tv_sec = (__force u64) ct / NSEC_PER_SEC; - val->tv_nsec = (__force u64) ct % NSEC_PER_SEC; + val->tv_sec = ct / NSEC_PER_SEC; + val->tv_nsec = ct % NSEC_PER_SEC; } /* @@ -80,28 +86,25 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) */ static inline cputime_t timeval_to_cputime(struct timeval *val) { - u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC; - return (__force cputime_t) ret; + cputime_t ret 
= val->tv_sec * NSEC_PER_SEC; + return (ret + val->tv_usec * NSEC_PER_USEC); } static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) { - val->tv_sec = (__force u64) ct / NSEC_PER_SEC; - val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC; + val->tv_sec = ct / NSEC_PER_SEC; + val->tv_usec = (ct % NSEC_PER_SEC) / NSEC_PER_USEC; } /* * Convert cputime <-> clock (USER_HZ) */ -#define cputime_to_clock_t(__ct) \ - ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ)) -#define clock_t_to_cputime(__x) \ - (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) +#define cputime_to_clock_t(__ct) ((__ct) / (NSEC_PER_SEC / USER_HZ)) +#define clock_t_to_cputime(__x) ((__x) * (NSEC_PER_SEC / USER_HZ)) /* * Convert cputime64 to clock. */ -#define cputime64_to_clock_t(__ct) \ - cputime_to_clock_t((__force cputime_t)__ct) +#define cputime64_to_clock_t(__ct) cputime_to_clock_t((cputime_t)__ct) #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __IA64_CPUTIME_H */ diff --git a/trunk/arch/ia64/mm/contig.c b/trunk/arch/ia64/mm/contig.c index 1516d1dc11fd..f114a3b14c6a 100644 --- a/trunk/arch/ia64/mm/contig.c +++ b/trunk/arch/ia64/mm/contig.c @@ -16,7 +16,6 @@ */ #include #include -#include #include #include #include @@ -349,7 +348,7 @@ paging_init (void) printk("Virtual mem_map starts at 0x%p\n", mem_map); } #else /* !CONFIG_VIRTUAL_MEM_MAP */ - memblock_add_node(0, PFN_PHYS(max_low_pfn), 0); + add_active_range(0, 0, max_low_pfn); free_area_init_nodes(max_zone_pfns); #endif /* !CONFIG_VIRTUAL_MEM_MAP */ zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); diff --git a/trunk/arch/ia64/mm/init.c b/trunk/arch/ia64/mm/init.c index 13df239dbed1..00cb0e26c64e 100644 --- a/trunk/arch/ia64/mm/init.c +++ b/trunk/arch/ia64/mm/init.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -558,7 +557,8 @@ int __init register_active_ranges(u64 start, u64 len, int nid) #endif if (start < end) - memblock_add_node(__pa(start), end - start, nid); + add_active_range(nid, __pa(start) >> PAGE_SHIFT, + __pa(end) >> PAGE_SHIFT); return 0; } diff --git a/trunk/arch/m68k/platform/68328/timers.c b/trunk/arch/m68k/platform/68328/timers.c index f2678866067b..309f725995bf 100644 --- a/trunk/arch/m68k/platform/68328/timers.c +++ b/trunk/arch/m68k/platform/68328/timers.c @@ -93,6 +93,7 @@ static struct clocksource m68328_clk = { .name = "timer", .rating = 250, .read = m68328_read_clk, + .shift = 20, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -114,7 +115,8 @@ void hw_timer_init(void) /* Enable timer 1 */ TCTL |= TCTL_TEN; - clocksource_register_hz(&m68328_clk, TICKS_PER_JIFFY*HZ); + m68328_clk.mult = clocksource_hz2mult(TICKS_PER_JIFFY*HZ, m68328_clk.shift); + clocksource_register(&m68328_clk); } /***************************************************************************/ diff --git a/trunk/arch/m68k/platform/coldfire/dma_timer.c b/trunk/arch/m68k/platform/coldfire/dma_timer.c index 235ad57c4707..a5f562823d7a 100644 --- a/trunk/arch/m68k/platform/coldfire/dma_timer.c +++ b/trunk/arch/m68k/platform/coldfire/dma_timer.c @@ -44,6 +44,7 @@ static struct clocksource clocksource_cf_dt = { .rating = 200, .read = cf_dt_get_cycles, .mask = CLOCKSOURCE_MASK(32), + .shift = 20, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -59,7 +60,9 @@ static int __init init_cf_dt_clocksource(void) __raw_writeb(0x00, DTER0); __raw_writel(0x00000000, DTRR0); __raw_writew(DMA_DTMR_CLK_DIV_16 | DMA_DTMR_ENABLE, DTMR0); - return clocksource_register_hz(&clocksource_cf_dt, 
DMA_FREQ); + clocksource_cf_dt.mult = clocksource_hz2mult(DMA_FREQ, + clocksource_cf_dt.shift); + return clocksource_register(&clocksource_cf_dt); } arch_initcall(init_cf_dt_clocksource); diff --git a/trunk/arch/m68k/platform/coldfire/pit.c b/trunk/arch/m68k/platform/coldfire/pit.c index 02663d25822d..c2b980926bec 100644 --- a/trunk/arch/m68k/platform/coldfire/pit.c +++ b/trunk/arch/m68k/platform/coldfire/pit.c @@ -144,6 +144,7 @@ static struct clocksource pit_clk = { .name = "pit", .rating = 100, .read = pit_read_clk, + .shift = 20, .mask = CLOCKSOURCE_MASK(32), }; @@ -161,7 +162,8 @@ void hw_timer_init(void) setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &pit_irq); - clocksource_register_hz(&pit_clk, FREQ); + pit_clk.mult = clocksource_hz2mult(FREQ, pit_clk.shift); + clocksource_register(&pit_clk); } /***************************************************************************/ diff --git a/trunk/arch/m68k/platform/coldfire/sltimers.c b/trunk/arch/m68k/platform/coldfire/sltimers.c index b7f822b552bb..6a85daf9a7fd 100644 --- a/trunk/arch/m68k/platform/coldfire/sltimers.c +++ b/trunk/arch/m68k/platform/coldfire/sltimers.c @@ -114,6 +114,7 @@ static struct clocksource mcfslt_clk = { .name = "slt", .rating = 250, .read = mcfslt_read_clk, + .shift = 20, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -135,7 +136,8 @@ void hw_timer_init(void) setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq); - clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK); + mcfslt_clk.mult = clocksource_hz2mult(MCF_BUSCLK, mcfslt_clk.shift); + clocksource_register(&mcfslt_clk); #ifdef CONFIG_HIGHPROFILE mcfslt_profile_init(); diff --git a/trunk/arch/m68k/platform/coldfire/timers.c b/trunk/arch/m68k/platform/coldfire/timers.c index 0d90da32fcdb..60242f65fea9 100644 --- a/trunk/arch/m68k/platform/coldfire/timers.c +++ b/trunk/arch/m68k/platform/coldfire/timers.c @@ -88,6 +88,7 @@ static struct clocksource mcftmr_clk = { .name = "tmr", .rating = 250, .read = mcftmr_read_clk, + .shift = 20, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -108,7 +109,8 @@ void hw_timer_init(void) __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 | MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, TA(MCFTIMER_TMR)); - clocksource_register_hz(&mcftmr_clk, FREQ); + mcftmr_clk.mult = clocksource_hz2mult(FREQ, mcftmr_clk.shift); + clocksource_register(&mcftmr_clk); setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq); diff --git a/trunk/arch/microblaze/include/asm/memblock.h b/trunk/arch/microblaze/include/asm/memblock.h new file mode 100644 index 000000000000..20a8e257c77f --- /dev/null +++ b/trunk/arch/microblaze/include/asm/memblock.h @@ -0,0 +1,14 @@ +/* + * Copyright (C) 2008 Michal Simek + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
+ */ + +#ifndef _ASM_MICROBLAZE_MEMBLOCK_H +#define _ASM_MICROBLAZE_MEMBLOCK_H + +#endif /* _ASM_MICROBLAZE_MEMBLOCK_H */ + + diff --git a/trunk/arch/microblaze/kernel/process.c b/trunk/arch/microblaze/kernel/process.c index 7dcb5bfffb75..95cc295976a7 100644 --- a/trunk/arch/microblaze/kernel/process.c +++ b/trunk/arch/microblaze/kernel/process.c @@ -103,12 +103,10 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) idle(); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); diff --git a/trunk/arch/microblaze/kernel/prom.c b/trunk/arch/microblaze/kernel/prom.c index 80d314e81901..977484add216 100644 --- a/trunk/arch/microblaze/kernel/prom.c +++ b/trunk/arch/microblaze/kernel/prom.c @@ -122,6 +122,7 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line); /* Scan memory nodes and rebuild MEMBLOCKs */ + memblock_init(); of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory, NULL); @@ -129,7 +130,7 @@ void __init early_init_devtree(void *params) strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE); parse_early_param(); - memblock_allow_resize(); + memblock_analyze(); pr_debug("Phys. mem: %lx\n", (unsigned long) memblock_phys_mem_size()); diff --git a/trunk/arch/mips/Kconfig b/trunk/arch/mips/Kconfig index 9c652eb68aaa..d46f1da18a3c 100644 --- a/trunk/arch/mips/Kconfig +++ b/trunk/arch/mips/Kconfig @@ -25,9 +25,6 @@ config MIPS select GENERIC_IRQ_SHOW select HAVE_ARCH_JUMP_LABEL select IRQ_FORCED_THREADING - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK menu "Machine selection" @@ -2067,6 +2064,9 @@ config ARCH_DISCONTIGMEM_ENABLE or have huge holes in the physical address space for other reasons. See for more. 
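[Editor's note, not part of the patch: the microblaze prom.c hunk just above restores the older memblock bring-up ordering — memblock_init() before the device-tree memory scan, memblock_add() per discovered region, then memblock_analyze() before any size query; the machine_kexec.c hunk later in this patch spells out that memblock_phys_mem_size() depends on that analyze step. The following is a toy model of that contract, written for illustration only; it is not the real mm/memblock.c API.]

/*
 * Editor's sketch: the init -> add -> analyze -> query ordering this
 * revert restores.  Miniature model, not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_REGIONS 8

struct region { uint64_t base, size; };
static struct region mem[MAX_REGIONS];
static int mem_cnt;
static uint64_t total_size;	/* valid only after memblock_analyze() */

static void memblock_init(void) { mem_cnt = 0; total_size = 0; }

static void memblock_add(uint64_t base, uint64_t size)
{
	if (mem_cnt < MAX_REGIONS)
		mem[mem_cnt++] = (struct region){ base, size };
}

/* Callers must re-run this after changing the region list before
 * trusting memblock_phys_mem_size() -- the dependency the
 * machine_kexec.c comment below points out. */
static void memblock_analyze(void)
{
	total_size = 0;
	for (int i = 0; i < mem_cnt; i++)
		total_size += mem[i].size;
}

static uint64_t memblock_phys_mem_size(void) { return total_size; }

int main(void)
{
	memblock_init();			/* 1: before scanning memory nodes */
	memblock_add(0x00000000, 0x10000000);	/* 2: one add per bank/node */
	memblock_add(0x20000000, 0x10000000);
	memblock_analyze();			/* 3: derive totals before queries */
	printf("phys mem: %llu MiB\n",
	       (unsigned long long)(memblock_phys_mem_size() >> 20));
	return 0;
}

[End of editor's note; the patch continues below.]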
+config ARCH_POPULATES_NODE_MAP + def_bool y + config ARCH_SPARSEMEM_ENABLE bool select SPARSEMEM_STATIC diff --git a/trunk/arch/mips/kernel/process.c b/trunk/arch/mips/kernel/process.c index 7955409051c4..c47f96e453c0 100644 --- a/trunk/arch/mips/kernel/process.c +++ b/trunk/arch/mips/kernel/process.c @@ -56,8 +56,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -78,8 +77,7 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/mips/kernel/setup.c b/trunk/arch/mips/kernel/setup.c index b1cb8f87d7b4..84af26ab2212 100644 --- a/trunk/arch/mips/kernel/setup.c +++ b/trunk/arch/mips/kernel/setup.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -353,7 +352,7 @@ static void __init bootmem_init(void) continue; #endif - memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0); + add_active_range(0, start, end); } /* diff --git a/trunk/arch/mips/sgi-ip27/ip27-memory.c b/trunk/arch/mips/sgi-ip27/ip27-memory.c index b105eca3c020..bc1297109cc5 100644 --- a/trunk/arch/mips/sgi-ip27/ip27-memory.c +++ b/trunk/arch/mips/sgi-ip27/ip27-memory.c @@ -12,7 +12,6 @@ */ #include #include -#include #include #include #include @@ -382,8 +381,8 @@ static void __init szmem(void) continue; } num_physpages += slot_psize; - memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)), - PFN_PHYS(slot_psize), node); + add_active_range(node, slot_getbasepfn(node, slot), + slot_getbasepfn(node, slot) + slot_psize); } } } diff --git a/trunk/arch/openrisc/include/asm/memblock.h b/trunk/arch/openrisc/include/asm/memblock.h new file mode 100644 index 000000000000..bbe5a1c788cb --- /dev/null +++ b/trunk/arch/openrisc/include/asm/memblock.h @@ -0,0 +1,24 @@ +/* + * OpenRISC Linux + * + * Linux architectural port borrowing liberally from similar works of + * others. All original copyrights apply as per the original source + * declaration. + * + * OpenRISC implementation: + * Copyright (C) 2003 Matjaz Breskvar + * Copyright (C) 2010-2011 Jonas Bonn + * et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __ASM_OPENRISC_MEMBLOCK_H +#define __ASM_OPENRISC_MEMBLOCK_H + +/* empty */ + +#endif /* __ASM_OPENRISC_MEMBLOCK_H */ diff --git a/trunk/arch/openrisc/kernel/idle.c b/trunk/arch/openrisc/kernel/idle.c index e5fc78877830..d5bc5f813e89 100644 --- a/trunk/arch/openrisc/kernel/idle.c +++ b/trunk/arch/openrisc/kernel/idle.c @@ -51,8 +51,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); @@ -70,8 +69,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/openrisc/kernel/prom.c b/trunk/arch/openrisc/kernel/prom.c index 3d4478f6c942..1bb58ba89afa 100644 --- a/trunk/arch/openrisc/kernel/prom.c +++ b/trunk/arch/openrisc/kernel/prom.c @@ -76,13 +76,14 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line); /* Scan memory nodes and rebuild MEMBLOCKs */ + memblock_init(); of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory, NULL); /* Save command line for /proc/cmdline and then parse parameters */ strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE); - memblock_allow_resize(); + memblock_analyze(); /* We must copy the flattend device tree from init memory to regular * memory because the device tree references the strings in it diff --git a/trunk/arch/parisc/kernel/time.c b/trunk/arch/parisc/kernel/time.c index 7c0774397b89..45b7389d77aa 100644 --- a/trunk/arch/parisc/kernel/time.c +++ b/trunk/arch/parisc/kernel/time.c @@ -198,6 +198,8 @@ static struct clocksource clocksource_cr16 = { .rating = 300, .read = read_cr16, .mask = CLOCKSOURCE_MASK(BITS_PER_LONG), + .mult = 0, /* to be set */ + .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -268,5 +270,7 @@ void __init time_init(void) /* register at clocksource framework */ current_cr16_khz = PAGE0->mem_10msec/10; /* kHz */ - clocksource_register_khz(&clocksource_cr16, current_cr16_khz); + clocksource_cr16.mult = clocksource_khz2mult(current_cr16_khz, + clocksource_cr16.shift); + clocksource_register(&clocksource_cr16); } diff --git a/trunk/arch/powerpc/Kconfig b/trunk/arch/powerpc/Kconfig index ead0bc68439d..951e18f5335b 100644 --- a/trunk/arch/powerpc/Kconfig +++ b/trunk/arch/powerpc/Kconfig @@ -117,7 +117,6 @@ config PPC select HAVE_KRETPROBES select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP @@ -422,6 +421,9 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y depends on (SMP && PPC_PSERIES) || PPC_PS3 +config ARCH_POPULATES_NODE_MAP + def_bool y + config SYS_SUPPORTS_HUGETLBFS bool diff --git a/trunk/arch/powerpc/include/asm/cputime.h b/trunk/arch/powerpc/include/asm/cputime.h index 6ec1c380a4d6..1cf20bdfbeca 100644 --- a/trunk/arch/powerpc/include/asm/cputime.h +++ b/trunk/arch/powerpc/include/asm/cputime.h @@ -29,8 +29,25 @@ static inline void setup_cputime_one_jiffy(void) { } #include #include -typedef u64 __nocast cputime_t; -typedef u64 __nocast cputime64_t; +typedef u64 cputime_t; +typedef u64 cputime64_t; + +#define cputime_zero ((cputime_t)0) +#define cputime_max ((~((cputime_t)0) >> 1) - 1) +#define cputime_add(__a, __b) ((__a) + (__b)) +#define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, 
__n) ((__a) / (__n)) +#define cputime_halve(__a) ((__a) >> 1) +#define cputime_eq(__a, __b) ((__a) == (__b)) +#define cputime_gt(__a, __b) ((__a) > (__b)) +#define cputime_ge(__a, __b) ((__a) >= (__b)) +#define cputime_lt(__a, __b) ((__a) < (__b)) +#define cputime_le(__a, __b) ((__a) <= (__b)) + +#define cputime64_zero ((cputime64_t)0) +#define cputime64_add(__a, __b) ((__a) + (__b)) +#define cputime64_sub(__a, __b) ((__a) - (__b)) +#define cputime_to_cputime64(__ct) (__ct) #ifdef __KERNEL__ @@ -48,7 +65,7 @@ DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta); static inline unsigned long cputime_to_jiffies(const cputime_t ct) { - return mulhdu((__force u64) ct, __cputime_jiffies_factor); + return mulhdu(ct, __cputime_jiffies_factor); } /* Estimate the scaled cputime by scaling the real cputime based on @@ -57,15 +74,14 @@ static inline cputime_t cputime_to_scaled(const cputime_t ct) { if (cpu_has_feature(CPU_FTR_SPURR) && __get_cpu_var(cputime_last_delta)) - return (__force u64) ct * - __get_cpu_var(cputime_scaled_last_delta) / - __get_cpu_var(cputime_last_delta); + return ct * __get_cpu_var(cputime_scaled_last_delta) / + __get_cpu_var(cputime_last_delta); return ct; } static inline cputime_t jiffies_to_cputime(const unsigned long jif) { - u64 ct; + cputime_t ct; unsigned long sec; /* have to be a little careful about overflow */ @@ -77,7 +93,7 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif) } if (sec) ct += (cputime_t) sec * tb_ticks_per_sec; - return (__force cputime_t) ct; + return ct; } static inline void setup_cputime_one_jiffy(void) @@ -87,7 +103,7 @@ static inline void setup_cputime_one_jiffy(void) static inline cputime64_t jiffies64_to_cputime64(const u64 jif) { - u64 ct; + cputime_t ct; u64 sec; /* have to be a little careful about overflow */ @@ -98,13 +114,13 @@ static inline cputime64_t jiffies64_to_cputime64(const u64 jif) do_div(ct, HZ); } if (sec) - ct += (u64) sec * tb_ticks_per_sec; - return (__force cputime64_t) ct; + ct += (cputime_t) sec * tb_ticks_per_sec; + return ct; } static inline u64 cputime64_to_jiffies64(const cputime_t ct) { - return mulhdu((__force u64) ct, __cputime_jiffies_factor); + return mulhdu(ct, __cputime_jiffies_factor); } /* @@ -114,12 +130,12 @@ extern u64 __cputime_msec_factor; static inline unsigned long cputime_to_usecs(const cputime_t ct) { - return mulhdu((__force u64) ct, __cputime_msec_factor) * USEC_PER_MSEC; + return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC; } static inline cputime_t usecs_to_cputime(const unsigned long us) { - u64 ct; + cputime_t ct; unsigned long sec; /* have to be a little careful about overflow */ @@ -131,11 +147,9 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) } if (sec) ct += (cputime_t) sec * tb_ticks_per_sec; - return (__force cputime_t) ct; + return ct; } -#define usecs_to_cputime64(us) usecs_to_cputime(us) - /* * Convert cputime <-> seconds */ @@ -143,12 +157,12 @@ extern u64 __cputime_sec_factor; static inline unsigned long cputime_to_secs(const cputime_t ct) { - return mulhdu((__force u64) ct, __cputime_sec_factor); + return mulhdu(ct, __cputime_sec_factor); } static inline cputime_t secs_to_cputime(const unsigned long sec) { - return (__force cputime_t)((u64) sec * tb_ticks_per_sec); + return (cputime_t) sec * tb_ticks_per_sec; } /* @@ -156,7 +170,7 @@ static inline cputime_t secs_to_cputime(const unsigned long sec) */ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p) { - u64 x = (__force u64) ct; + u64 x = ct; unsigned int 
frac; frac = do_div(x, tb_ticks_per_sec); @@ -168,11 +182,11 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p) static inline cputime_t timespec_to_cputime(const struct timespec *p) { - u64 ct; + cputime_t ct; ct = (u64) p->tv_nsec * tb_ticks_per_sec; do_div(ct, 1000000000); - return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec); + return ct + (u64) p->tv_sec * tb_ticks_per_sec; } /* @@ -180,7 +194,7 @@ static inline cputime_t timespec_to_cputime(const struct timespec *p) */ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p) { - u64 x = (__force u64) ct; + u64 x = ct; unsigned int frac; frac = do_div(x, tb_ticks_per_sec); @@ -192,11 +206,11 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p) static inline cputime_t timeval_to_cputime(const struct timeval *p) { - u64 ct; + cputime_t ct; ct = (u64) p->tv_usec * tb_ticks_per_sec; do_div(ct, 1000000); - return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec); + return ct + (u64) p->tv_sec * tb_ticks_per_sec; } /* @@ -206,12 +220,12 @@ extern u64 __cputime_clockt_factor; static inline unsigned long cputime_to_clock_t(const cputime_t ct) { - return mulhdu((__force u64) ct, __cputime_clockt_factor); + return mulhdu(ct, __cputime_clockt_factor); } static inline cputime_t clock_t_to_cputime(const unsigned long clk) { - u64 ct; + cputime_t ct; unsigned long sec; /* have to be a little careful about overflow */ @@ -222,8 +236,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk) do_div(ct, USER_HZ); } if (sec) - ct += (u64) sec * tb_ticks_per_sec; - return (__force cputime_t) ct; + ct += (cputime_t) sec * tb_ticks_per_sec; + return ct; } #define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct)) diff --git a/trunk/arch/powerpc/include/asm/kvm_book3s.h b/trunk/arch/powerpc/include/asm/kvm_book3s.h index 69c7377d2071..d4df013ad779 100644 --- a/trunk/arch/powerpc/include/asm/kvm_book3s.h +++ b/trunk/arch/powerpc/include/asm/kvm_book3s.h @@ -381,6 +381,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) } #endif +static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, + unsigned long pte_index) +{ + unsigned long rb, va_low; + + rb = (v & ~0x7fUL) << 16; /* AVA field */ + va_low = pte_index >> 3; + if (v & HPTE_V_SECONDARY) + va_low = ~va_low; + /* xor vsid from AVA */ + if (!(v & HPTE_V_1TB_SEG)) + va_low ^= v >> 12; + else + va_low ^= v >> 24; + va_low &= 0x7ff; + if (v & HPTE_V_LARGE) { + rb |= 1; /* L field */ + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { + /* non-16MB large page, must be 64k */ + /* (masks depend on page size) */ + rb |= 0x1000; /* page encoding in LP field */ + rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ + rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ + } + } else { + /* 4kB page */ + rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ + } + rb |= (v >> 54) & 0x300; /* B field */ + return rb; +} + /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/trunk/arch/powerpc/include/asm/kvm_book3s_64.h b/trunk/arch/powerpc/include/asm/kvm_book3s_64.h index d0ac94f98f9e..e43fe42b9875 100644 --- a/trunk/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/trunk/arch/powerpc/include/asm/kvm_book3s_64.h @@ -29,37 +29,4 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) #define 
SPAPR_TCE_SHIFT 12 -static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, - unsigned long pte_index) -{ - unsigned long rb, va_low; - - rb = (v & ~0x7fUL) << 16; /* AVA field */ - va_low = pte_index >> 3; - if (v & HPTE_V_SECONDARY) - va_low = ~va_low; - /* xor vsid from AVA */ - if (!(v & HPTE_V_1TB_SEG)) - va_low ^= v >> 12; - else - va_low ^= v >> 24; - va_low &= 0x7ff; - if (v & HPTE_V_LARGE) { - rb |= 1; /* L field */ - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (r & 0xff000)) { - /* non-16MB large page, must be 64k */ - /* (masks depend on page size) */ - rb |= 0x1000; /* page encoding in LP field */ - rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ - rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ - } - } else { - /* 4kB page */ - rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ - } - rb |= (v >> 54) & 0x300; /* B field */ - return rb; -} - #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/trunk/arch/powerpc/include/asm/memblock.h b/trunk/arch/powerpc/include/asm/memblock.h new file mode 100644 index 000000000000..43efc345065e --- /dev/null +++ b/trunk/arch/powerpc/include/asm/memblock.h @@ -0,0 +1,8 @@ +#ifndef _ASM_POWERPC_MEMBLOCK_H +#define _ASM_POWERPC_MEMBLOCK_H + +#include + +#define MEMBLOCK_DBG(fmt...) udbg_printf(fmt) + +#endif /* _ASM_POWERPC_MEMBLOCK_H */ diff --git a/trunk/arch/powerpc/kernel/idle.c b/trunk/arch/powerpc/kernel/idle.c index 9c3cd490b1bd..39a2baa6ad58 100644 --- a/trunk/arch/powerpc/kernel/idle.c +++ b/trunk/arch/powerpc/kernel/idle.c @@ -46,12 +46,6 @@ static int __init powersave_off(char *arg) } __setup("powersave=off", powersave_off); -#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_TRACEPOINTS) -static const bool idle_uses_rcu = 1; -#else -static const bool idle_uses_rcu; -#endif - /* * The body of the idle task. 
*/ @@ -62,10 +56,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); - if (!idle_uses_rcu) - rcu_idle_enter(); - + tick_nohz_stop_sched_tick(1); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -102,9 +93,7 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - if (!idle_uses_rcu) - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/trunk/arch/powerpc/kernel/machine_kexec.c b/trunk/arch/powerpc/kernel/machine_kexec.c index a2158a395d96..9ce1672afb59 100644 --- a/trunk/arch/powerpc/kernel/machine_kexec.c +++ b/trunk/arch/powerpc/kernel/machine_kexec.c @@ -107,6 +107,9 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + /* this is necessary because of memblock_phys_mem_size() */ + memblock_analyze(); + /* use common parsing */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); diff --git a/trunk/arch/powerpc/kernel/prom.c b/trunk/arch/powerpc/kernel/prom.c index abe405dab34d..fa1235b0503b 100644 --- a/trunk/arch/powerpc/kernel/prom.c +++ b/trunk/arch/powerpc/kernel/prom.c @@ -733,6 +733,8 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line); /* Scan memory nodes and rebuild MEMBLOCKs */ + memblock_init(); + of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); @@ -754,14 +756,20 @@ void __init early_init_devtree(void *params) early_reserve_mem(); phyp_dump_reserve_mem(); - /* - * Ensure that total memory size is page-aligned, because otherwise - * mark_bootmem() gets upset. - */ - limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); + limit = memory_limit; + if (! limit) { + phys_addr_t memsize; + + /* Ensure that total memory size is page-aligned, because + * otherwise mark_bootmem() gets upset. */ + memblock_analyze(); + memsize = memblock_phys_mem_size(); + if ((memsize & PAGE_MASK) != memsize) + limit = memsize & PAGE_MASK; + } memblock_enforce_memory_limit(limit); - memblock_allow_resize(); + memblock_analyze(); memblock_dump_all(); DBG("Phys. 
mem: %llx\n", memblock_phys_mem_size()); diff --git a/trunk/arch/powerpc/kvm/book3s_hv.c b/trunk/arch/powerpc/kvm/book3s_hv.c index 336983da9e72..0cb137a9b038 100644 --- a/trunk/arch/powerpc/kvm/book3s_hv.c +++ b/trunk/arch/powerpc/kvm/book3s_hv.c @@ -538,7 +538,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) tpaca->kvm_hstate.napping = 0; vcpu->cpu = vc->pcpu; smp_wmb(); -#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) +#ifdef CONFIG_PPC_ICP_NATIVE if (vcpu->arch.ptid) { tpaca->cpu_start = 0x80; wmb(); diff --git a/trunk/arch/powerpc/kvm/book3s_pr.c b/trunk/arch/powerpc/kvm/book3s_pr.c index e2cfb9e1e20e..3c791e1eb675 100644 --- a/trunk/arch/powerpc/kvm/book3s_pr.c +++ b/trunk/arch/powerpc/kvm/book3s_pr.c @@ -658,12 +658,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; -#ifdef CONFIG_KVM_BOOK3S_64_PR if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; } -#endif run->papr_hcall.nr = cmd; for (i = 0; i < 9; ++i) { diff --git a/trunk/arch/powerpc/kvm/e500.c b/trunk/arch/powerpc/kvm/e500.c index 8c0d45a6faf7..26d20903f2bc 100644 --- a/trunk/arch/powerpc/kvm/e500.c +++ b/trunk/arch/powerpc/kvm/e500.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/trunk/arch/powerpc/mm/init_32.c b/trunk/arch/powerpc/mm/init_32.c index 58861fa1220e..161cefde5c15 100644 --- a/trunk/arch/powerpc/mm/init_32.c +++ b/trunk/arch/powerpc/mm/init_32.c @@ -134,7 +134,8 @@ void __init MMU_init(void) if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII - memblock_enforce_memory_limit(memblock.memory.regions[0].size); + memblock.memory.cnt = 1; + memblock_analyze(); printk(KERN_WARNING "Only using first contiguous memory region"); #else wii_memory_fixups(); @@ -157,6 +158,7 @@ void __init MMU_init(void) #ifndef CONFIG_HIGHMEM total_memory = total_lowmem; memblock_enforce_memory_limit(total_lowmem); + memblock_analyze(); #endif /* CONFIG_HIGHMEM */ } diff --git a/trunk/arch/powerpc/mm/mem.c b/trunk/arch/powerpc/mm/mem.c index 8e2eb6611b0b..2dd6bdd31fe1 100644 --- a/trunk/arch/powerpc/mm/mem.c +++ b/trunk/arch/powerpc/mm/mem.c @@ -199,7 +199,7 @@ void __init do_init_bootmem(void) unsigned long start_pfn, end_pfn; start_pfn = memblock_region_memory_base_pfn(reg); end_pfn = memblock_region_memory_end_pfn(reg); - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + add_active_range(0, start_pfn, end_pfn); } /* Add all physical memory to the bootmem map, mark each area diff --git a/trunk/arch/powerpc/mm/numa.c b/trunk/arch/powerpc/mm/numa.c index e6eea0ac80c8..b22a83a91cb8 100644 --- a/trunk/arch/powerpc/mm/numa.c +++ b/trunk/arch/powerpc/mm/numa.c @@ -127,25 +127,45 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, } /* - * get_node_active_region - Return active region containing pfn - * Active range returned is empty if none found. 
- * @pfn: The page to return the region for - * @node_ar: Returned set to the active region containing @pfn + * get_active_region_work_fn - A helper function for get_node_active_region + * Returns datax set to the start_pfn and end_pfn if they contain + * the initial value of datax->start_pfn between them + * @start_pfn: start page(inclusive) of region to check + * @end_pfn: end page(exclusive) of region to check + * @datax: comes in with ->start_pfn set to value to search for and + * goes out with active range if it contains it + * Returns 1 if search value is in range else 0 */ -static void __init get_node_active_region(unsigned long pfn, - struct node_active_region *node_ar) +static int __init get_active_region_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) { - unsigned long start_pfn, end_pfn; - int i, nid; + struct node_active_region *data; + data = (struct node_active_region *)datax; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - if (pfn >= start_pfn && pfn < end_pfn) { - node_ar->nid = nid; - node_ar->start_pfn = start_pfn; - node_ar->end_pfn = end_pfn; - break; - } + if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) { + data->start_pfn = start_pfn; + data->end_pfn = end_pfn; + return 1; } + return 0; + +} + +/* + * get_node_active_region - Return active region containing start_pfn + * Active range returned is empty if none found. + * @start_pfn: The page to return the region for. + * @node_ar: Returned set to the active region containing start_pfn + */ +static void __init get_node_active_region(unsigned long start_pfn, + struct node_active_region *node_ar) +{ + int nid = early_pfn_to_nid(start_pfn); + + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + node_ar->end_pfn = start_pfn; + work_with_active_regions(nid, get_active_region_work_fn, node_ar); } static void map_cpu_to_node(int cpu, int node) @@ -690,7 +710,9 @@ static void __init parse_drconf_memory(struct device_node *memory) node_set_online(nid); sz = numa_enforce_memory_limit(base, size); if (sz) - memblock_set_node(base, sz, nid); + add_active_range(nid, base >> PAGE_SHIFT, + (base >> PAGE_SHIFT) + + (sz >> PAGE_SHIFT)); } while (--ranges); } } @@ -780,7 +802,8 @@ static int __init parse_numa_properties(void) continue; } - memblock_set_node(start, size, nid); + add_active_range(nid, start >> PAGE_SHIFT, + (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT)); if (--ranges) goto new_range; @@ -816,8 +839,7 @@ static void __init setup_nonnuma(void) end_pfn = memblock_region_memory_end_pfn(reg); fake_numa_create_new_node(end_pfn, &nid); - memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + add_active_range(nid, start_pfn, end_pfn); node_set_online(nid); } } diff --git a/trunk/arch/powerpc/mm/tlb_nohash.c b/trunk/arch/powerpc/mm/tlb_nohash.c index 573ba3b69d1f..4e13d6f9023e 100644 --- a/trunk/arch/powerpc/mm/tlb_nohash.c +++ b/trunk/arch/powerpc/mm/tlb_nohash.c @@ -615,6 +615,7 @@ static void __early_init_mmu(int boot_cpu) /* limit memory so we dont have linear faults */ memblock_enforce_memory_limit(linear_map_top); + memblock_analyze(); patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e); diff --git a/trunk/arch/powerpc/platforms/embedded6xx/wii.c b/trunk/arch/powerpc/platforms/embedded6xx/wii.c index 6d8dadf19f0b..1b5dc1a2e145 100644 --- a/trunk/arch/powerpc/platforms/embedded6xx/wii.c +++ b/trunk/arch/powerpc/platforms/embedded6xx/wii.c @@ -79,18 +79,23 @@ void 
__init wii_memory_fixups(void) BUG_ON(memblock.memory.cnt != 2); BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); - /* trim unaligned tail */ - memblock_remove(ALIGN(p[1].base + p[1].size, PAGE_SIZE), - (phys_addr_t)ULLONG_MAX); + p[0].size = _ALIGN_DOWN(p[0].size, PAGE_SIZE); + p[1].size = _ALIGN_DOWN(p[1].size, PAGE_SIZE); - /* determine hole, add & reserve them */ - wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE); + wii_hole_start = p[0].base + p[0].size; wii_hole_size = p[1].base - wii_hole_start; - memblock_add(wii_hole_start, wii_hole_size); - memblock_reserve(wii_hole_start, wii_hole_size); - BUG_ON(memblock.memory.cnt != 1); - __memblock_dump_all(); + pr_info("MEM1: <%08llx %08llx>\n", p[0].base, p[0].size); + pr_info("HOLE: <%08lx %08lx>\n", wii_hole_start, wii_hole_size); + pr_info("MEM2: <%08llx %08llx>\n", p[1].base, p[1].size); + + p[0].size += wii_hole_size + p[1].size; + + memblock.memory.cnt = 1; + memblock_analyze(); + + /* reserve the hole */ + memblock_reserve(wii_hole_start, wii_hole_size); /* allow ioremapping the address space in the hole */ __allow_ioremap_reserved = 1; diff --git a/trunk/arch/powerpc/platforms/iseries/setup.c b/trunk/arch/powerpc/platforms/iseries/setup.c index 8fc62586a973..ea0acbd8966d 100644 --- a/trunk/arch/powerpc/platforms/iseries/setup.c +++ b/trunk/arch/powerpc/platforms/iseries/setup.c @@ -563,8 +563,7 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -578,8 +577,7 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -595,8 +593,7 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -613,8 +610,7 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/powerpc/platforms/ps3/mm.c b/trunk/arch/powerpc/platforms/ps3/mm.c index 8bd6ba542691..72714ad27842 100644 --- a/trunk/arch/powerpc/platforms/ps3/mm.c +++ b/trunk/arch/powerpc/platforms/ps3/mm.c @@ -319,6 +319,7 @@ static int __init ps3_mm_add_memory(void) } memblock_add(start_addr, map.r1.size); + memblock_analyze(); result = online_pages(start_pfn, nr_pages); diff --git a/trunk/arch/powerpc/platforms/pseries/lpar.c b/trunk/arch/powerpc/platforms/pseries/lpar.c index 52d429be6c76..27a49508b410 100644 --- a/trunk/arch/powerpc/platforms/pseries/lpar.c +++ b/trunk/arch/powerpc/platforms/pseries/lpar.c @@ -555,8 +555,6 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args) (*depth)++; trace_hcall_entry(opcode, args); - if (opcode == H_CEDE) - rcu_idle_enter(); (*depth)--; out: @@ -577,8 +575,6 @@ void __trace_hcall_exit(long opcode, unsigned long retval, goto out; (*depth)++; - if (opcode == H_CEDE) - rcu_idle_exit(); trace_hcall_exit(opcode, retval, retbuf); (*depth)--; diff --git a/trunk/arch/s390/Kconfig b/trunk/arch/s390/Kconfig index d48ede334434..373679b3744a 100644 --- a/trunk/arch/s390/Kconfig +++ b/trunk/arch/s390/Kconfig @@ -92,9 
+92,6 @@ config S390 select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 select HAVE_RCU_TABLE_FREE if SMP select ARCH_SAVE_PAGE_KEYS if HIBERNATION - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK @@ -348,6 +345,9 @@ config WARN_DYNAMIC_STACK Say N if you are unsure. +config ARCH_POPULATES_NODE_MAP + def_bool y + comment "Kernel preemption" source "kernel/Kconfig.preempt" diff --git a/trunk/arch/s390/appldata/appldata_os.c b/trunk/arch/s390/appldata/appldata_os.c index 4de031d6b76c..92f1cb745d69 100644 --- a/trunk/arch/s390/appldata/appldata_os.c +++ b/trunk/arch/s390/appldata/appldata_os.c @@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data) j = 0; for_each_online_cpu(i) { os_data->os_cpu[j].per_cpu_user = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); + cputime_to_jiffies(kstat_cpu(i).cpustat.user); os_data->os_cpu[j].per_cpu_nice = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); + cputime_to_jiffies(kstat_cpu(i).cpustat.nice); os_data->os_cpu[j].per_cpu_system = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); + cputime_to_jiffies(kstat_cpu(i).cpustat.system); os_data->os_cpu[j].per_cpu_idle = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); + cputime_to_jiffies(kstat_cpu(i).cpustat.idle); os_data->os_cpu[j].per_cpu_irq = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); + cputime_to_jiffies(kstat_cpu(i).cpustat.irq); os_data->os_cpu[j].per_cpu_softirq = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); + cputime_to_jiffies(kstat_cpu(i).cpustat.softirq); os_data->os_cpu[j].per_cpu_iowait = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); + cputime_to_jiffies(kstat_cpu(i).cpustat.iowait); os_data->os_cpu[j].per_cpu_steal = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); + cputime_to_jiffies(kstat_cpu(i).cpustat.steal); os_data->os_cpu[j].cpu_id = i; j++; } diff --git a/trunk/arch/s390/include/asm/cputime.h b/trunk/arch/s390/include/asm/cputime.h index c23c3900c304..081434878296 100644 --- a/trunk/arch/s390/include/asm/cputime.h +++ b/trunk/arch/s390/include/asm/cputime.h @@ -16,100 +16,114 @@ /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ -typedef unsigned long long __nocast cputime_t; -typedef unsigned long long __nocast cputime64_t; +typedef unsigned long long cputime_t; +typedef unsigned long long cputime64_t; -static inline unsigned long __div(unsigned long long n, unsigned long base) -{ #ifndef __s390x__ + +static inline unsigned int +__div(unsigned long long n, unsigned int base) +{ register_pair rp; rp.pair = n >> 1; asm ("dr %0,%1" : "+d" (rp) : "d" (base >> 1)); return rp.subreg.odd; -#else /* __s390x__ */ - return n / base; -#endif /* __s390x__ */ } -#define cputime_one_jiffy jiffies_to_cputime(1) - -/* - * Convert cputime to jiffies and back. 
- */ -static inline unsigned long cputime_to_jiffies(const cputime_t cputime) -{ - return __div((__force unsigned long long) cputime, 4096000000ULL / HZ); -} +#else /* __s390x__ */ -static inline cputime_t jiffies_to_cputime(const unsigned int jif) +static inline unsigned int +__div(unsigned long long n, unsigned int base) { - return (__force cputime_t)(jif * (4096000000ULL / HZ)); + return n / base; } -static inline u64 cputime64_to_jiffies64(cputime64_t cputime) -{ - unsigned long long jif = (__force unsigned long long) cputime; - do_div(jif, 4096000000ULL / HZ); - return jif; -} +#endif /* __s390x__ */ -static inline cputime64_t jiffies64_to_cputime64(const u64 jif) -{ - return (__force cputime64_t)(jif * (4096000000ULL / HZ)); +#define cputime_zero (0ULL) +#define cputime_one_jiffy jiffies_to_cputime(1) +#define cputime_max ((~0UL >> 1) - 1) +#define cputime_add(__a, __b) ((__a) + (__b)) +#define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, __n) ({ \ + unsigned long long __div = (__a); \ + do_div(__div,__n); \ + __div; \ +}) +#define cputime_halve(__a) ((__a) >> 1) +#define cputime_eq(__a, __b) ((__a) == (__b)) +#define cputime_gt(__a, __b) ((__a) > (__b)) +#define cputime_ge(__a, __b) ((__a) >= (__b)) +#define cputime_lt(__a, __b) ((__a) < (__b)) +#define cputime_le(__a, __b) ((__a) <= (__b)) +#define cputime_to_jiffies(__ct) (__div((__ct), 4096000000ULL / HZ)) +#define cputime_to_scaled(__ct) (__ct) +#define jiffies_to_cputime(__hz) ((cputime_t)(__hz) * (4096000000ULL / HZ)) + +#define cputime64_zero (0ULL) +#define cputime64_add(__a, __b) ((__a) + (__b)) +#define cputime_to_cputime64(__ct) (__ct) + +static inline u64 +cputime64_to_jiffies64(cputime64_t cputime) +{ + do_div(cputime, 4096000000ULL / HZ); + return cputime; } /* * Convert cputime to microseconds and back. */ -static inline unsigned int cputime_to_usecs(const cputime_t cputime) +static inline unsigned int +cputime_to_usecs(const cputime_t cputime) { - return (__force unsigned long long) cputime >> 12; + return cputime_div(cputime, 4096); } -static inline cputime_t usecs_to_cputime(const unsigned int m) +static inline cputime_t +usecs_to_cputime(const unsigned int m) { - return (__force cputime_t)(m * 4096ULL); + return (cputime_t) m * 4096; } -#define usecs_to_cputime64(m) usecs_to_cputime(m) - /* * Convert cputime to milliseconds and back. */ -static inline unsigned int cputime_to_secs(const cputime_t cputime) +static inline unsigned int +cputime_to_secs(const cputime_t cputime) { - return __div((__force unsigned long long) cputime, 2048000000) >> 1; + return __div(cputime, 2048000000) >> 1; } -static inline cputime_t secs_to_cputime(const unsigned int s) +static inline cputime_t +secs_to_cputime(const unsigned int s) { - return (__force cputime_t)(s * 4096000000ULL); + return (cputime_t) s * 4096000000ULL; } /* * Convert cputime to timespec and back. 
*/ -static inline cputime_t timespec_to_cputime(const struct timespec *value) +static inline cputime_t +timespec_to_cputime(const struct timespec *value) { - unsigned long long ret = value->tv_sec * 4096000000ULL; - return (__force cputime_t)(ret + value->tv_nsec * 4096 / 1000); + return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL; } -static inline void cputime_to_timespec(const cputime_t cputime, - struct timespec *value) +static inline void +cputime_to_timespec(const cputime_t cputime, struct timespec *value) { - unsigned long long __cputime = (__force unsigned long long) cputime; #ifndef __s390x__ register_pair rp; - rp.pair = __cputime >> 1; + rp.pair = cputime >> 1; asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); value->tv_nsec = rp.subreg.even * 1000 / 4096; value->tv_sec = rp.subreg.odd; #else - value->tv_nsec = (__cputime % 4096000000ULL) * 1000 / 4096; - value->tv_sec = __cputime / 4096000000ULL; + value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096; + value->tv_sec = cputime / 4096000000ULL; #endif } @@ -118,52 +132,50 @@ static inline void cputime_to_timespec(const cputime_t cputime, * Since cputime and timeval have the same resolution (microseconds) * this is easy. */ -static inline cputime_t timeval_to_cputime(const struct timeval *value) +static inline cputime_t +timeval_to_cputime(const struct timeval *value) { - unsigned long long ret = value->tv_sec * 4096000000ULL; - return (__force cputime_t)(ret + value->tv_usec * 4096ULL); + return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL; } -static inline void cputime_to_timeval(const cputime_t cputime, - struct timeval *value) +static inline void +cputime_to_timeval(const cputime_t cputime, struct timeval *value) { - unsigned long long __cputime = (__force unsigned long long) cputime; #ifndef __s390x__ register_pair rp; - rp.pair = __cputime >> 1; + rp.pair = cputime >> 1; asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); value->tv_usec = rp.subreg.even / 4096; value->tv_sec = rp.subreg.odd; #else - value->tv_usec = (__cputime % 4096000000ULL) / 4096; - value->tv_sec = __cputime / 4096000000ULL; + value->tv_usec = (cputime % 4096000000ULL) / 4096; + value->tv_sec = cputime / 4096000000ULL; #endif } /* * Convert cputime to clock and back. */ -static inline clock_t cputime_to_clock_t(cputime_t cputime) +static inline clock_t +cputime_to_clock_t(cputime_t cputime) { - unsigned long long clock = (__force unsigned long long) cputime; - do_div(clock, 4096000000ULL / USER_HZ); - return clock; + return cputime_div(cputime, 4096000000ULL / USER_HZ); } -static inline cputime_t clock_t_to_cputime(unsigned long x) +static inline cputime_t +clock_t_to_cputime(unsigned long x) { - return (__force cputime_t)(x * (4096000000ULL / USER_HZ)); + return (cputime_t) x * (4096000000ULL / USER_HZ); } /* * Convert cputime64 to clock. 
*/ -static inline clock_t cputime64_to_clock_t(cputime64_t cputime) +static inline clock_t +cputime64_to_clock_t(cputime64_t cputime) { - unsigned long long clock = (__force unsigned long long) cputime; - do_div(clock, 4096000000ULL / USER_HZ); - return clock; + return cputime_div(cputime, 4096000000ULL / USER_HZ); } struct s390_idle_data { diff --git a/trunk/arch/s390/kernel/process.c b/trunk/arch/s390/kernel/process.c index 3201ae447990..9451b210a1b4 100644 --- a/trunk/arch/s390/kernel/process.c +++ b/trunk/arch/s390/kernel/process.c @@ -91,12 +91,10 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) default_idle(); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/s390/kernel/setup.c b/trunk/arch/s390/kernel/setup.c index f11d1b037c50..e54c4ff8abaa 100644 --- a/trunk/arch/s390/kernel/setup.c +++ b/trunk/arch/s390/kernel/setup.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -821,8 +820,7 @@ setup_memory(void) end_chunk = min(end_chunk, end_pfn); if (start_chunk >= end_chunk) continue; - memblock_add_node(PFN_PHYS(start_chunk), - PFN_PHYS(end_chunk - start_chunk), 0); + add_active_range(0, start_chunk, end_chunk); pfn = max(start_chunk, start_pfn); for (; pfn < end_chunk; pfn++) page_set_storage_key(PFN_PHYS(pfn), diff --git a/trunk/arch/s390/oprofile/hwsampler.c b/trunk/arch/s390/oprofile/hwsampler.c index 9daee91e6c3f..f43c0e4282af 100644 --- a/trunk/arch/s390/oprofile/hwsampler.c +++ b/trunk/arch/s390/oprofile/hwsampler.c @@ -22,7 +22,6 @@ #include #include "hwsampler.h" -#include "op_counter.h" #define MAX_NUM_SDB 511 #define MIN_NUM_SDB 1 @@ -897,8 +896,6 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, if (sample_data_ptr->P == 1) { /* userspace sample */ unsigned int pid = sample_data_ptr->prim_asn; - if (!counter_config.user) - goto skip_sample; rcu_read_lock(); tsk = pid_task(find_vpid(pid), PIDTYPE_PID); if (tsk) @@ -906,8 +903,6 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, rcu_read_unlock(); } else { /* kernelspace sample */ - if (!counter_config.kernel) - goto skip_sample; regs = task_pt_regs(current); } @@ -915,7 +910,7 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, oprofile_add_ext_hw_sample(sample_data_ptr->ia, regs, 0, !sample_data_ptr->P, tsk); mutex_unlock(&hws_sem); - skip_sample: + sample_data_ptr++; } } diff --git a/trunk/arch/s390/oprofile/init.c b/trunk/arch/s390/oprofile/init.c index 2297be406c61..bd58b72454cf 100644 --- a/trunk/arch/s390/oprofile/init.c +++ b/trunk/arch/s390/oprofile/init.c @@ -2,11 +2,10 @@ * arch/s390/oprofile/init.c * * S390 Version - * Copyright (C) 2002-2011 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Copyright (C) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation * Author(s): Thomas Spatzier (tspat@de.ibm.com) * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com) * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com) - * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) * * @remark Copyright 2002-2011 OProfile authors */ @@ -15,8 +14,6 @@ #include #include #include -#include -#include #include "../../../drivers/oprofile/oprof.h" @@ -25,7 +22,6 @@ extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth); #ifdef CONFIG_64BIT #include 
"hwsampler.h" -#include "op_counter.h" #define DEFAULT_INTERVAL 4127518 @@ -39,41 +35,16 @@ static unsigned long oprofile_max_interval; static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS; static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS; -static int hwsampler_enabled; +static int hwsampler_file; static int hwsampler_running; /* start_mutex must be held to change */ -static int hwsampler_available; static struct oprofile_operations timer_ops; -struct op_counter_config counter_config; - -enum __force_cpu_type { - reserved = 0, /* do not force */ - timer, -}; -static int force_cpu_type; - -static int set_cpu_type(const char *str, struct kernel_param *kp) -{ - if (!strcmp(str, "timer")) { - force_cpu_type = timer; - printk(KERN_INFO "oprofile: forcing timer to be returned " - "as cpu type\n"); - } else { - force_cpu_type = 0; - } - - return 0; -} -module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); -MODULE_PARM_DESC(cpu_type, "Force legacy basic mode sampling" - "(report cpu_type \"timer\""); - static int oprofile_hwsampler_start(void) { int retval; - hwsampler_running = hwsampler_enabled; + hwsampler_running = hwsampler_file; if (!hwsampler_running) return timer_ops.start(); @@ -101,16 +72,10 @@ static void oprofile_hwsampler_stop(void) return; } -/* - * File ops used for: - * /dev/oprofile/0/enabled - * /dev/oprofile/hwsampling/hwsampler (cpu_type = timer) - */ - static ssize_t hwsampler_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { - return oprofilefs_ulong_to_user(hwsampler_enabled, buf, count, offset); + return oprofilefs_ulong_to_user(hwsampler_file, buf, count, offset); } static ssize_t hwsampler_write(struct file *file, char const __user *buf, @@ -126,9 +91,6 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf, if (retval <= 0) return retval; - if (val != 0 && val != 1) - return -EINVAL; - if (oprofile_started) /* * save to do without locking as we set @@ -137,7 +99,7 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf, */ return -EBUSY; - hwsampler_enabled = val; + hwsampler_file = val; return count; } @@ -147,311 +109,38 @@ static const struct file_operations hwsampler_fops = { .write = hwsampler_write, }; -/* - * File ops used for: - * /dev/oprofile/0/count - * /dev/oprofile/hwsampling/hw_interval (cpu_type = timer) - * - * Make sure that the value is within the hardware range. - */ - -static ssize_t hw_interval_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(oprofile_hw_interval, buf, - count, offset); -} - -static ssize_t hw_interval_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval) - return retval; - if (val < oprofile_min_interval) - oprofile_hw_interval = oprofile_min_interval; - else if (val > oprofile_max_interval) - oprofile_hw_interval = oprofile_max_interval; - else - oprofile_hw_interval = val; - - return count; -} - -static const struct file_operations hw_interval_fops = { - .read = hw_interval_read, - .write = hw_interval_write, -}; - -/* - * File ops used for: - * /dev/oprofile/0/event - * Only a single event with number 0 is supported with this counter. - * - * /dev/oprofile/0/unit_mask - * This is a dummy file needed by the user space tools. - * No value other than 0 is accepted or returned. 
- */ - -static ssize_t hwsampler_zero_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(0, buf, count, offset); -} - -static ssize_t hwsampler_zero_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval) - return retval; - if (val != 0) - return -EINVAL; - return count; -} - -static const struct file_operations zero_fops = { - .read = hwsampler_zero_read, - .write = hwsampler_zero_write, -}; - -/* /dev/oprofile/0/kernel file ops. */ - -static ssize_t hwsampler_kernel_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(counter_config.kernel, - buf, count, offset); -} - -static ssize_t hwsampler_kernel_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - counter_config.kernel = val; - - return count; -} - -static const struct file_operations kernel_fops = { - .read = hwsampler_kernel_read, - .write = hwsampler_kernel_write, -}; - -/* /dev/oprofile/0/user file ops. */ - -static ssize_t hwsampler_user_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(counter_config.user, - buf, count, offset); -} - -static ssize_t hwsampler_user_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - counter_config.user = val; - - return count; -} - -static const struct file_operations user_fops = { - .read = hwsampler_user_read, - .write = hwsampler_user_write, -}; - - -/* - * File ops used for: /dev/oprofile/timer/enabled - * The value always has to be the inverted value of hwsampler_enabled. So - * no separate variable is created. That way we do not need locking. - */ - -static ssize_t timer_enabled_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(!hwsampler_enabled, buf, count, offset); -} - -static ssize_t timer_enabled_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - /* Timer cannot be disabled without having hardware sampling. 
*/ - if (val == 0 && !hwsampler_available) - return -EINVAL; - - if (oprofile_started) - /* - * save to do without locking as we set - * hwsampler_running in start() when start_mutex is - * held - */ - return -EBUSY; - - hwsampler_enabled = !val; - - return count; -} - -static const struct file_operations timer_enabled_fops = { - .read = timer_enabled_read, - .write = timer_enabled_write, -}; - - static int oprofile_create_hwsampling_files(struct super_block *sb, - struct dentry *root) + struct dentry *root) { - struct dentry *dir; - - dir = oprofilefs_mkdir(sb, root, "timer"); - if (!dir) - return -EINVAL; - - oprofilefs_create_file(sb, dir, "enabled", &timer_enabled_fops); - - if (!hwsampler_available) - return 0; + struct dentry *hw_dir; /* reinitialize default values */ - hwsampler_enabled = 1; - counter_config.kernel = 1; - counter_config.user = 1; - - if (!force_cpu_type) { - /* - * Create the counter file system. A single virtual - * counter is created which can be used to - * enable/disable hardware sampling dynamically from - * user space. The user space will configure a single - * counter with a single event. The value of 'event' - * and 'unit_mask' are not evaluated by the kernel code - * and can only be set to 0. - */ + hwsampler_file = 1; - dir = oprofilefs_mkdir(sb, root, "0"); - if (!dir) - return -EINVAL; + hw_dir = oprofilefs_mkdir(sb, root, "hwsampling"); + if (!hw_dir) + return -EINVAL; - oprofilefs_create_file(sb, dir, "enabled", &hwsampler_fops); - oprofilefs_create_file(sb, dir, "event", &zero_fops); - oprofilefs_create_file(sb, dir, "count", &hw_interval_fops); - oprofilefs_create_file(sb, dir, "unit_mask", &zero_fops); - oprofilefs_create_file(sb, dir, "kernel", &kernel_fops); - oprofilefs_create_file(sb, dir, "user", &user_fops); - oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks", - &oprofile_sdbt_blocks); + oprofilefs_create_file(sb, hw_dir, "hwsampler", &hwsampler_fops); + oprofilefs_create_ulong(sb, hw_dir, "hw_interval", + &oprofile_hw_interval); + oprofilefs_create_ro_ulong(sb, hw_dir, "hw_min_interval", + &oprofile_min_interval); + oprofilefs_create_ro_ulong(sb, hw_dir, "hw_max_interval", + &oprofile_max_interval); + oprofilefs_create_ulong(sb, hw_dir, "hw_sdbt_blocks", + &oprofile_sdbt_blocks); - } else { - /* - * Hardware sampling can be used but the cpu_type is - * forced to timer in order to deal with legacy user - * space tools. The /dev/oprofile/hwsampling fs is - * provided in that case. - */ - dir = oprofilefs_mkdir(sb, root, "hwsampling"); - if (!dir) - return -EINVAL; - - oprofilefs_create_file(sb, dir, "hwsampler", - &hwsampler_fops); - oprofilefs_create_file(sb, dir, "hw_interval", - &hw_interval_fops); - oprofilefs_create_ro_ulong(sb, dir, "hw_min_interval", - &oprofile_min_interval); - oprofilefs_create_ro_ulong(sb, dir, "hw_max_interval", - &oprofile_max_interval); - oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks", - &oprofile_sdbt_blocks); - } return 0; } static int oprofile_hwsampler_init(struct oprofile_operations *ops) { - /* - * Initialize the timer mode infrastructure as well in order - * to be able to switch back dynamically. oprofile_timer_init - * is not supposed to fail. - */ - if (oprofile_timer_init(ops)) - BUG(); - - memcpy(&timer_ops, ops, sizeof(timer_ops)); - ops->create_files = oprofile_create_hwsampling_files; - - /* - * If the user space tools do not support newer cpu types, - * the force_cpu_type module parameter - * can be used to always return \"timer\" as cpu type. 
- */ - if (force_cpu_type != timer) { - struct cpuid id; - - get_cpu_id (&id); - - switch (id.machine) { - case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break; - case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break; - default: return -ENODEV; - } - } - if (hwsampler_setup()) return -ENODEV; /* - * Query the range for the sampling interval from the - * hardware. + * create hwsampler files only if hwsampler_setup() succeeds. */ oprofile_min_interval = hwsampler_query_min_interval(); if (oprofile_min_interval == 0) @@ -466,17 +155,23 @@ static int oprofile_hwsampler_init(struct oprofile_operations *ops) if (oprofile_hw_interval > oprofile_max_interval) oprofile_hw_interval = oprofile_max_interval; - printk(KERN_INFO "oprofile: System z hardware sampling " - "facility found.\n"); + if (oprofile_timer_init(ops)) + return -ENODEV; + + printk(KERN_INFO "oprofile: using hardware sampling\n"); + + memcpy(&timer_ops, ops, sizeof(timer_ops)); ops->start = oprofile_hwsampler_start; ops->stop = oprofile_hwsampler_stop; + ops->create_files = oprofile_create_hwsampling_files; return 0; } static void oprofile_hwsampler_exit(void) { + oprofile_timer_exit(); hwsampler_shutdown(); } @@ -487,15 +182,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) ops->backtrace = s390_backtrace; #ifdef CONFIG_64BIT - - /* - * -ENODEV is not reported to the caller. The module itself - * will use the timer mode sampling as fallback and this is - * always available. - */ - hwsampler_available = oprofile_hwsampler_init(ops) == 0; - - return 0; + return oprofile_hwsampler_init(ops); #else return -ENODEV; #endif diff --git a/trunk/arch/s390/oprofile/op_counter.h b/trunk/arch/s390/oprofile/op_counter.h deleted file mode 100644 index 1a8d3ca09014..000000000000 --- a/trunk/arch/s390/oprofile/op_counter.h +++ /dev/null @@ -1,23 +0,0 @@ -/** - * arch/s390/oprofile/op_counter.h - * - * Copyright (C) 2011 IBM Deutschland Entwicklung GmbH, IBM Corporation - * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) - * - * @remark Copyright 2011 OProfile authors - */ - -#ifndef OP_COUNTER_H -#define OP_COUNTER_H - -struct op_counter_config { - /* `enabled' maps to the hwsampler_file variable. */ - /* `count' maps to the oprofile_hw_interval variable. */ - /* `event' and `unit_mask' are unused. */ - unsigned long kernel; - unsigned long user; -}; - -extern struct op_counter_config counter_config; - -#endif /* OP_COUNTER_H */ diff --git a/trunk/arch/score/Kconfig b/trunk/arch/score/Kconfig index 8b0c9464aa9d..df169e84db4e 100644 --- a/trunk/arch/score/Kconfig +++ b/trunk/arch/score/Kconfig @@ -4,9 +4,6 @@ config SCORE def_bool y select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK choice prompt "System type" @@ -63,6 +60,9 @@ config 32BIT config ARCH_FLATMEM_ENABLE def_bool y +config ARCH_POPULATES_NODE_MAP + def_bool y + source "mm/Kconfig" config MEMORY_START diff --git a/trunk/arch/score/kernel/setup.c b/trunk/arch/score/kernel/setup.c index b48459afefdd..6f898c057878 100644 --- a/trunk/arch/score/kernel/setup.c +++ b/trunk/arch/score/kernel/setup.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -55,8 +54,7 @@ static void __init bootmem_init(void) /* Initialize the boot-time allocator with low memory only. 
*/ bootmap_size = init_bootmem_node(NODE_DATA(0), start_pfn, min_low_pfn, max_low_pfn); - memblock_add_node(PFN_PHYS(min_low_pfn), - PFN_PHYS(max_low_pfn - min_low_pfn), 0); + add_active_range(0, min_low_pfn, max_low_pfn); free_bootmem(PFN_PHYS(start_pfn), (max_low_pfn - start_pfn) << PAGE_SHIFT); diff --git a/trunk/arch/sh/Kconfig b/trunk/arch/sh/Kconfig index 47a2f1c2cb0d..5629e2099130 100644 --- a/trunk/arch/sh/Kconfig +++ b/trunk/arch/sh/Kconfig @@ -4,7 +4,6 @@ config SUPERH select CLKDEV_LOOKUP select HAVE_IDE if HAS_IOPORT select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_GENERIC_DMA_COHERENT select HAVE_ARCH_TRACEHOOK diff --git a/trunk/arch/sh/include/asm/memblock.h b/trunk/arch/sh/include/asm/memblock.h new file mode 100644 index 000000000000..e87063fad2ea --- /dev/null +++ b/trunk/arch/sh/include/asm/memblock.h @@ -0,0 +1,4 @@ +#ifndef __ASM_SH_MEMBLOCK_H +#define __ASM_SH_MEMBLOCK_H + +#endif /* __ASM_SH_MEMBLOCK_H */ diff --git a/trunk/arch/sh/kernel/idle.c b/trunk/arch/sh/kernel/idle.c index 406508d4ce74..db4ecd731a00 100644 --- a/trunk/arch/sh/kernel/idle.c +++ b/trunk/arch/sh/kernel/idle.c @@ -89,8 +89,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); @@ -112,8 +111,7 @@ void cpu_idle(void) start_critical_timings(); } - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/sh/kernel/machine_kexec.c b/trunk/arch/sh/kernel/machine_kexec.c index 9fea49f6e667..c5a33f007f88 100644 --- a/trunk/arch/sh/kernel/machine_kexec.c +++ b/trunk/arch/sh/kernel/machine_kexec.c @@ -157,6 +157,9 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + /* this is necessary because of memblock_phys_mem_size() */ + memblock_analyze(); + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); if (ret == 0 && crash_size > 0) { diff --git a/trunk/arch/sh/kernel/setup.c b/trunk/arch/sh/kernel/setup.c index 7b57bf1dc855..1a0e946679a4 100644 --- a/trunk/arch/sh/kernel/setup.c +++ b/trunk/arch/sh/kernel/setup.c @@ -230,8 +230,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, pmb_bolt_mapping((unsigned long)__va(start), start, end - start, PAGE_KERNEL); - memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + add_active_range(nid, start_pfn, end_pfn); } void __init __weak plat_early_device_setup(void) diff --git a/trunk/arch/sh/mm/Kconfig b/trunk/arch/sh/mm/Kconfig index cb8f9920f4dd..c3e61b366493 100644 --- a/trunk/arch/sh/mm/Kconfig +++ b/trunk/arch/sh/mm/Kconfig @@ -143,6 +143,9 @@ config MAX_ACTIVE_REGIONS CPU_SUBTYPE_SH7785) default "1" +config ARCH_POPULATES_NODE_MAP + def_bool y + config ARCH_SELECT_MEMORY_MODEL def_bool y diff --git a/trunk/arch/sh/mm/init.c b/trunk/arch/sh/mm/init.c index 82cc576fab15..939ca0f356f6 100644 --- a/trunk/arch/sh/mm/init.c +++ b/trunk/arch/sh/mm/init.c @@ -324,6 +324,7 @@ void __init paging_init(void) unsigned long vaddr, end; int nid; + memblock_init(); sh_mv.mv_mem_init(); early_reserve_mem(); @@ -336,7 +337,7 @@ void __init paging_init(void) sh_mv.mv_mem_reserve(); memblock_enforce_memory_limit(memory_limit); - memblock_allow_resize(); + memblock_analyze(); memblock_dump_all(); diff --git a/trunk/arch/sparc/Kconfig 
b/trunk/arch/sparc/Kconfig index 70ae9d81870e..f92602e86607 100644 --- a/trunk/arch/sparc/Kconfig +++ b/trunk/arch/sparc/Kconfig @@ -43,7 +43,6 @@ config SPARC64 select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP select HAVE_SYSCALL_WRAPPERS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD @@ -353,6 +352,9 @@ config NODES_SPAN_OTHER_NODES def_bool y depends on NEED_MULTIPLE_NODES +config ARCH_POPULATES_NODE_MAP + def_bool y if SPARC64 + config ARCH_SELECT_MEMORY_MODEL def_bool y if SPARC64 diff --git a/trunk/arch/sparc/include/asm/memblock.h b/trunk/arch/sparc/include/asm/memblock.h new file mode 100644 index 000000000000..c67b047ef85e --- /dev/null +++ b/trunk/arch/sparc/include/asm/memblock.h @@ -0,0 +1,8 @@ +#ifndef _SPARC64_MEMBLOCK_H +#define _SPARC64_MEMBLOCK_H + +#include + +#define MEMBLOCK_DBG(fmt...) prom_printf(fmt) + +#endif /* !(_SPARC64_MEMBLOCK_H) */ diff --git a/trunk/arch/sparc/kernel/pci_sun4v.c b/trunk/arch/sparc/kernel/pci_sun4v.c index af5755d20fbe..b272cda35a01 100644 --- a/trunk/arch/sparc/kernel/pci_sun4v.c +++ b/trunk/arch/sparc/kernel/pci_sun4v.c @@ -849,10 +849,10 @@ static int pci_sun4v_msiq_build_irq(struct pci_pbm_info *pbm, if (!irq) return -ENOMEM; - if (pci_sun4v_msiq_setvalid(pbm->devhandle, msiqid, HV_MSIQ_VALID)) - return -EINVAL; if (pci_sun4v_msiq_setstate(pbm->devhandle, msiqid, HV_MSIQSTATE_IDLE)) return -EINVAL; + if (pci_sun4v_msiq_setvalid(pbm->devhandle, msiqid, HV_MSIQ_VALID)) + return -EINVAL; return irq; } diff --git a/trunk/arch/sparc/kernel/process_64.c b/trunk/arch/sparc/kernel/process_64.c index 39d8b05201a2..3739a06a76cb 100644 --- a/trunk/arch/sparc/kernel/process_64.c +++ b/trunk/arch/sparc/kernel/process_64.c @@ -95,14 +95,12 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/trunk/arch/sparc/kernel/setup_32.c b/trunk/arch/sparc/kernel/setup_32.c index ffb883ddd0f0..fe1e3fc31bc5 100644 --- a/trunk/arch/sparc/kernel/setup_32.c +++ b/trunk/arch/sparc/kernel/setup_32.c @@ -84,7 +84,7 @@ static void prom_sync_me(void) prom_printf("PROM SYNC COMMAND...\n"); show_free_areas(0); - if (!is_idle_task(current)) { + if(current->pid != 0) { local_irq_enable(); sys_sync(); local_irq_disable(); diff --git a/trunk/arch/sparc/mm/init_64.c b/trunk/arch/sparc/mm/init_64.c index b3f5e7dfea51..8e073d802139 100644 --- a/trunk/arch/sparc/mm/init_64.c +++ b/trunk/arch/sparc/mm/init_64.c @@ -790,7 +790,7 @@ static int find_node(unsigned long addr) return -1; } -static u64 memblock_nid_range(u64 start, u64 end, int *nid) +u64 memblock_nid_range(u64 start, u64 end, int *nid) { *nid = find_node(start); start += PAGE_SIZE; @@ -808,7 +808,7 @@ static u64 memblock_nid_range(u64 start, u64 end, int *nid) return start; } #else -static u64 memblock_nid_range(u64 start, u64 end, int *nid) +u64 memblock_nid_range(u64 start, u64 end, int *nid) { *nid = 0; return end; @@ -816,7 +816,7 @@ static u64 memblock_nid_range(u64 start, u64 end, int *nid) #endif /* This must be invoked after performing all of the necessary - * memblock_set_node() calls for 'nid'. We need to be able to get + * add_active_range() calls for 'nid'. We need to be able to get * correct data from get_pfn_range_for_nid(). 
*/ static void __init allocate_node_data(int nid) @@ -987,11 +987,14 @@ static void __init add_node_ranges(void) this_end = memblock_nid_range(start, end, &nid); - numadbg("Setting memblock NUMA node nid[%d] " + numadbg("Adding active range nid[%d] " "start[%lx] end[%lx]\n", nid, start, this_end); - memblock_set_node(start, this_end - start, nid); + add_active_range(nid, + start >> PAGE_SHIFT, + this_end >> PAGE_SHIFT); + start = this_end; } } @@ -1279,6 +1282,7 @@ static void __init bootmem_init_nonnuma(void) { unsigned long top_of_ram = memblock_end_of_DRAM(); unsigned long total_ram = memblock_phys_mem_size(); + struct memblock_region *reg; numadbg("bootmem_init_nonnuma()\n"); @@ -1288,8 +1292,20 @@ static void __init bootmem_init_nonnuma(void) (top_of_ram - total_ram) >> 20); init_node_masks_nonnuma(); - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + + for_each_memblock(memory, reg) { + unsigned long start_pfn, end_pfn; + + if (!reg->size) + continue; + + start_pfn = memblock_region_memory_base_pfn(reg); + end_pfn = memblock_region_memory_end_pfn(reg); + add_active_range(0, start_pfn, end_pfn); + } + allocate_node_data(0); + node_set_online(0); } @@ -1753,6 +1769,8 @@ void __init paging_init(void) sun4v_ktsb_init(); } + memblock_init(); + /* Find available physical memory... * * Read it twice in order to work around a bug in openfirmware. @@ -1778,7 +1796,7 @@ void __init paging_init(void) memblock_enforce_memory_limit(cmdline_memory_size); - memblock_allow_resize(); + memblock_analyze(); memblock_dump_all(); set_bit(0, mmu_context_bmap); diff --git a/trunk/arch/tile/kernel/process.c b/trunk/arch/tile/kernel/process.c index 4c1ac6e5347a..9c45d8bbdf57 100644 --- a/trunk/arch/tile/kernel/process.c +++ b/trunk/arch/tile/kernel/process.c @@ -85,8 +85,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -106,8 +105,7 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/tile/mm/fault.c b/trunk/arch/tile/mm/fault.c index c1eaaa1fcc20..25b7b90fd620 100644 --- a/trunk/arch/tile/mm/fault.c +++ b/trunk/arch/tile/mm/fault.c @@ -54,7 +54,7 @@ static noinline void force_sig_info_fault(const char *type, int si_signo, if (unlikely(tsk->pid < 2)) { panic("Signal %d (code %d) at %#lx sent to %s!", si_signo, si_code & 0xffff, address, - is_idle_task(tsk) ? "the idle task" : "init"); + tsk->pid ? "init" : "the idle task"); } info.si_signo = si_signo; @@ -515,7 +515,7 @@ static int handle_page_fault(struct pt_regs *regs, if (unlikely(tsk->pid < 2)) { panic("Kernel page fault running %s!", - is_idle_task(tsk) ? "the idle task" : "init"); + tsk->pid ? 
"init" : "the idle task"); } /* diff --git a/trunk/arch/um/kernel/process.c b/trunk/arch/um/kernel/process.c index 69f24905abdc..c5338351aecd 100644 --- a/trunk/arch/um/kernel/process.c +++ b/trunk/arch/um/kernel/process.c @@ -246,12 +246,10 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); nsecs = disable_timer(); idle_sleep(nsecs); - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); } } diff --git a/trunk/arch/um/kernel/time.c b/trunk/arch/um/kernel/time.c index 82a6e22f1f35..a08d9fab81f2 100644 --- a/trunk/arch/um/kernel/time.c +++ b/trunk/arch/um/kernel/time.c @@ -75,6 +75,8 @@ static struct clocksource itimer_clocksource = { .rating = 300, .read = itimer_read, .mask = CLOCKSOURCE_MASK(64), + .mult = 1000, + .shift = 0, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -92,9 +94,9 @@ static void __init setup_itimer(void) clockevent_delta2ns(60 * HZ, &itimer_clockevent); itimer_clockevent.min_delta_ns = clockevent_delta2ns(1, &itimer_clockevent); - err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC); + err = clocksource_register(&itimer_clocksource); if (err) { - printk(KERN_ERR "clocksource_register_hz returned %d\n", err); + printk(KERN_ERR "clocksource_register returned %d\n", err); return; } clockevents_register_device(&itimer_clockevent); diff --git a/trunk/arch/unicore32/kernel/process.c b/trunk/arch/unicore32/kernel/process.c index 52edc2b62873..ba401df971ed 100644 --- a/trunk/arch/unicore32/kernel/process.c +++ b/trunk/arch/unicore32/kernel/process.c @@ -55,8 +55,7 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -64,8 +63,7 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/unicore32/kernel/setup.c b/trunk/arch/unicore32/kernel/setup.c index 673d7a89d8ff..471b6bca8da4 100644 --- a/trunk/arch/unicore32/kernel/setup.c +++ b/trunk/arch/unicore32/kernel/setup.c @@ -37,7 +37,6 @@ #include #include #include -#include #include "setup.h" diff --git a/trunk/arch/unicore32/mm/init.c b/trunk/arch/unicore32/mm/init.c index de186bde8975..3b379cddbc64 100644 --- a/trunk/arch/unicore32/mm/init.c +++ b/trunk/arch/unicore32/mm/init.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include "mm.h" @@ -246,6 +245,7 @@ void __init uc32_memblock_init(struct meminfo *mi) sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); + memblock_init(); for (i = 0; i < mi->nr_banks; i++) memblock_add(mi->bank[i].start, mi->bank[i].size); @@ -264,7 +264,7 @@ void __init uc32_memblock_init(struct meminfo *mi) uc32_mm_memblock_reserve(); - memblock_allow_resize(); + memblock_analyze(); memblock_dump_all(); } diff --git a/trunk/arch/unicore32/mm/mmu.c b/trunk/arch/unicore32/mm/mmu.c index 43c20b40e444..3e5c3e5a0b45 100644 --- a/trunk/arch/unicore32/mm/mmu.c +++ b/trunk/arch/unicore32/mm/mmu.c @@ -25,7 +25,6 @@ #include #include #include -#include #include diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 5731eb70e0a0..efb42949cc09 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -26,8 +26,6 @@ config X86 select HAVE_IOREMAP_PROT select HAVE_KPROBES 
select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS @@ -206,6 +204,9 @@ config ZONE_DMA32 bool default X86_64 +config ARCH_POPULATES_NODE_MAP + def_bool y + config AUDIT_ARCH bool default X86_64 @@ -342,7 +343,6 @@ config X86_EXTENDED_PLATFORM If you enable this option then you'll be able to select support for the following (non-PC) 64 bit x86 platforms: - Numascale NumaChip ScaleMP vSMP SGI Ultraviolet @@ -351,18 +351,6 @@ config X86_EXTENDED_PLATFORM endif # This is an alphabetically sorted list of 64 bit extended platforms # Please maintain the alphabetic order if and when there are additions -config X86_NUMACHIP - bool "Numascale NumaChip" - depends on X86_64 - depends on X86_EXTENDED_PLATFORM - depends on NUMA - depends on SMP - depends on X86_X2APIC - depends on !EDAC_AMD64 - ---help--- - Adds support for Numascale NumaChip large-SMP systems. Needed to - enable more than ~168 cores. - If you don't have one of these, you should say N here. config X86_VSMP bool "ScaleMP vSMP" diff --git a/trunk/arch/x86/ia32/ia32entry.S b/trunk/arch/x86/ia32/ia32entry.S index 3e274564f6bf..a6253ec1b284 100644 --- a/trunk/arch/x86/ia32/ia32entry.S +++ b/trunk/arch/x86/ia32/ia32entry.S @@ -134,7 +134,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rsp,0 pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ - movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d + movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ @@ -150,8 +150,9 @@ ENTRY(ia32_sysenter_target) .section __ex_table,"a" .quad 1b,ia32_badarg .previous - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -161,12 +162,13 @@ sysenter_do_call: sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags(%r10) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + andl $~TS_COMPAT,TI_status(%r10) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) movl RIP-R11(%rsp),%edx /* User %eip */ @@ -203,7 +205,7 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jnz ia32_ret_from_sys_call TRACE_IRQS_ON sti @@ -213,11 +215,12 @@ sysexit_from_sys_call: movzbl %al,%edi /* zero-extend that into %edi */ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ call audit_syscall_exit + GET_THREAD_INFO(%r10) movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF - testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl %edi,TI_flags(%r10) jz \exit CLEAR_RREGS -ARGOFFSET jmp int_with_check @@ -235,7 +238,7 @@ sysexit_audit: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl 
$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jz sysenter_auditsys #endif SAVE_REST @@ -306,8 +309,9 @@ ENTRY(ia32_cstar_target) .section __ex_table,"a" .quad 1b,ia32_badarg .previous - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -317,12 +321,13 @@ cstar_do_call: cstar_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags(%r10) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + andl $~TS_COMPAT,TI_status(%r10) RESTORE_ARGS 0,-ARG_SKIP,0,0,0 movl RIP-ARGOFFSET(%rsp),%ecx CFI_REGISTER rip,rcx @@ -350,7 +355,7 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jz cstar_auditsys #endif xchgl %r9d,%ebp @@ -415,8 +420,9 @@ ENTRY(ia32_syscall) /* note the registers are not zero extended to the sf. this could be a problem. */ SAVE_ARGS 0,1,0 - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -453,8 +459,8 @@ quiet_ni_syscall: CFI_ENDPROC .macro PTREGSCALL label, func, arg - ALIGN -GLOBAL(\label) + .globl \label +\label: leaq \func(%rip),%rax leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ia32_ptregs_common @@ -471,8 +477,7 @@ GLOBAL(\label) PTREGSCALL stub32_vfork, sys_vfork, %rdi PTREGSCALL stub32_iopl, sys_iopl, %rsi - ALIGN -ia32_ptregs_common: +ENTRY(ia32_ptregs_common) popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple diff --git a/trunk/arch/x86/include/asm/alternative-asm.h b/trunk/arch/x86/include/asm/alternative-asm.h index 952bd0100c5c..091508b533b4 100644 --- a/trunk/arch/x86/include/asm/alternative-asm.h +++ b/trunk/arch/x86/include/asm/alternative-asm.h @@ -4,10 +4,10 @@ #ifdef CONFIG_SMP .macro LOCK_PREFIX -672: lock +1: lock .section .smp_locks,"a" .balign 4 - .long 672b - . + .long 1b - . 
.previous .endm #else diff --git a/trunk/arch/x86/include/asm/apic.h b/trunk/arch/x86/include/asm/apic.h index 3ab9bdd87e79..1a6c09af048f 100644 --- a/trunk/arch/x86/include/asm/apic.h +++ b/trunk/arch/x86/include/asm/apic.h @@ -176,7 +176,6 @@ static inline u64 native_x2apic_icr_read(void) } extern int x2apic_phys; -extern int x2apic_preenabled; extern void check_x2apic(void); extern void enable_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); @@ -199,9 +198,6 @@ static inline void x2apic_force_phys(void) x2apic_phys = 1; } #else -static inline void disable_x2apic(void) -{ -} static inline void check_x2apic(void) { } @@ -216,7 +212,6 @@ static inline void x2apic_force_phys(void) { } -#define nox2apic 0 #define x2apic_preenabled 0 #define x2apic_supported() 0 #endif @@ -415,7 +410,6 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); #endif #ifdef CONFIG_X86_LOCAL_APIC - static inline u32 apic_read(u32 reg) { return apic->read(reg); diff --git a/trunk/arch/x86/include/asm/apic_flat_64.h b/trunk/arch/x86/include/asm/apic_flat_64.h deleted file mode 100644 index a2d312796440..000000000000 --- a/trunk/arch/x86/include/asm/apic_flat_64.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_X86_APIC_FLAT_64_H -#define _ASM_X86_APIC_FLAT_64_H - -extern void flat_init_apic_ldr(void); - -#endif - diff --git a/trunk/arch/x86/include/asm/apicdef.h b/trunk/arch/x86/include/asm/apicdef.h index 134bba00df09..3925d8007864 100644 --- a/trunk/arch/x86/include/asm/apicdef.h +++ b/trunk/arch/x86/include/asm/apicdef.h @@ -144,7 +144,6 @@ #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) #define APIC_BASE_MSR 0x800 -#define XAPIC_ENABLE (1UL << 11) #define X2APIC_ENABLE (1UL << 10) #ifdef CONFIG_X86_32 diff --git a/trunk/arch/x86/include/asm/bitops.h b/trunk/arch/x86/include/asm/bitops.h index b97596e2b68c..1775d6e5920e 100644 --- a/trunk/arch/x86/include/asm/bitops.h +++ b/trunk/arch/x86/include/asm/bitops.h @@ -380,8 +380,6 @@ static inline unsigned long __fls(unsigned long word) return word; } -#undef ADDR - #ifdef __KERNEL__ /** * ffs - find first set bit in word @@ -397,25 +395,10 @@ static inline unsigned long __fls(unsigned long word) static inline int ffs(int x) { int r; - -#ifdef CONFIG_X86_64 - /* - * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the - * dest reg is undefined if x==0, but their CPU architect says its - * value is written to set it to the same as before, except that the - * top 32 bits will be cleared. - * - * We cannot do this on 32 bits because at the very least some - * 486 CPUs did not behave this way. - */ - long tmp = -1; - asm("bsfl %1,%0" - : "=r" (r) - : "rm" (x), "0" (tmp)); -#elif defined(CONFIG_X86_CMOV) +#ifdef CONFIG_X86_CMOV asm("bsfl %1,%0\n\t" "cmovzl %2,%0" - : "=&r" (r) : "rm" (x), "r" (-1)); + : "=r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" @@ -439,22 +422,7 @@ static inline int ffs(int x) static inline int fls(int x) { int r; - -#ifdef CONFIG_X86_64 - /* - * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the - * dest reg is undefined if x==0, but their CPU architect says its - * value is written to set it to the same as before, except that the - * top 32 bits will be cleared. - * - * We cannot do this on 32 bits because at the very least some - * 486 CPUs did not behave this way. 
- */ - long tmp = -1; - asm("bsrl %1,%0" - : "=r" (r) - : "rm" (x), "0" (tmp)); -#elif defined(CONFIG_X86_CMOV) +#ifdef CONFIG_X86_CMOV asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); @@ -466,35 +434,11 @@ static inline int fls(int x) #endif return r + 1; } +#endif /* __KERNEL__ */ -/** - * fls64 - find last set bit in a 64-bit word - * @x: the word to search - * - * This is defined in a similar way as the libc and compiler builtin - * ffsll, but returns the position of the most significant set bit. - * - * fls64(value) returns 0 if value is 0 or the position of the last - * set bit if value is nonzero. The last (most significant) bit is - * at position 64. - */ -#ifdef CONFIG_X86_64 -static __always_inline int fls64(__u64 x) -{ - long bitpos = -1; - /* - * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the - * dest reg is undefined if x==0, but their CPU architect says its - * value is written to set it to the same as before. - */ - asm("bsrq %1,%0" - : "+r" (bitpos) - : "rm" (x)); - return bitpos + 1; -} -#else -#include -#endif +#undef ADDR + +#ifdef __KERNEL__ #include @@ -506,6 +450,12 @@ static __always_inline int fls64(__u64 x) #include +#endif /* __KERNEL__ */ + +#include + +#ifdef __KERNEL__ + #include #include diff --git a/trunk/arch/x86/include/asm/cmpxchg.h b/trunk/arch/x86/include/asm/cmpxchg.h index 0c9fa2745f13..5d3acdf5a7a6 100644 --- a/trunk/arch/x86/include/asm/cmpxchg.h +++ b/trunk/arch/x86/include/asm/cmpxchg.h @@ -14,8 +14,6 @@ extern void __cmpxchg_wrong_size(void) __compiletime_error("Bad argument size for cmpxchg"); extern void __xadd_wrong_size(void) __compiletime_error("Bad argument size for xadd"); -extern void __add_wrong_size(void) - __compiletime_error("Bad argument size for add"); /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to @@ -33,47 +31,60 @@ extern void __add_wrong_size(void) #define __X86_CASE_Q -1 /* sizeof will never return -1 */ #endif -/* - * An exchange-type operation, which takes a value and a pointer, and - * returns a the old value. - */ -#define __xchg_op(ptr, arg, op, lock) \ - ({ \ - __typeof__ (*(ptr)) __ret = (arg); \ - switch (sizeof(*(ptr))) { \ - case __X86_CASE_B: \ - asm volatile (lock #op "b %b0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ - break; \ - case __X86_CASE_W: \ - asm volatile (lock #op "w %w0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ - break; \ - case __X86_CASE_L: \ - asm volatile (lock #op "l %0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ - break; \ - case __X86_CASE_Q: \ - asm volatile (lock #op "q %q0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ - break; \ - default: \ - __ ## op ## _wrong_size(); \ - } \ - __ret; \ - }) - /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. * Since this is generally used to protect other memory information, we * use "asm volatile" and "memory" clobbers to prevent gcc from moving * information around. 
*/ -#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile("xchgq %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) /* * Atomic compare and exchange. Compare OLD with MEM, if identical, @@ -154,80 +165,46 @@ extern void __add_wrong_size(void) __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif -/* - * xadd() adds "inc" to "*ptr" and atomically returns the previous - * value of "*ptr". - * - * xadd() is locked when multiple CPUs are online - * xadd_sync() is always locked - * xadd_local() is never locked - */ -#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) -#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") -#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") - -#define __add(ptr, inc, lock) \ +#define __xadd(ptr, inc, lock) \ ({ \ __typeof__ (*(ptr)) __ret = (inc); \ switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ - asm volatile (lock "addb %b1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ + asm volatile (lock "xaddb %b0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ break; \ case __X86_CASE_W: \ - asm volatile (lock "addw %w1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ + asm volatile (lock "xaddw %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ break; \ case __X86_CASE_L: \ - asm volatile (lock "addl %1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ + asm volatile (lock "xaddl %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ break; \ case __X86_CASE_Q: \ - asm volatile (lock "addq %1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ + asm volatile (lock "xaddq %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ break; \ default: \ - __add_wrong_size(); \ + __xadd_wrong_size(); \ } \ __ret; \ }) /* - * add_*() adds "inc" to "*ptr" + * xadd() adds "inc" to "*ptr" and atomically returns the previous + * value of "*ptr". 
* - * __add() takes a lock prefix - * add_smp() is locked when multiple CPUs are online - * add_sync() is always locked + * xadd() is locked when multiple CPUs are online + * xadd_sync() is always locked + * xadd_local() is never locked */ -#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX) -#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ") - -#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ -({ \ - bool __ret; \ - __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \ - __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \ - BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ - BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ - VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ - VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ - asm volatile(pfx "cmpxchg%c4b %2; sete %0" \ - : "=a" (__ret), "+d" (__old2), \ - "+m" (*(p1)), "+m" (*(p2)) \ - : "i" (2 * sizeof(long)), "a" (__old1), \ - "b" (__new1), "c" (__new2)); \ - __ret; \ -}) - -#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ - __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) - -#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ - __cmpxchg_double(, p1, p2, o1, o2, n1, n2) +#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) +#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") +#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") #endif /* ASM_X86_CMPXCHG_H */ diff --git a/trunk/arch/x86/include/asm/cmpxchg_32.h b/trunk/arch/x86/include/asm/cmpxchg_32.h index 53f4b219336b..fbebb07dd80b 100644 --- a/trunk/arch/x86/include/asm/cmpxchg_32.h +++ b/trunk/arch/x86/include/asm/cmpxchg_32.h @@ -166,6 +166,52 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, #endif +#define cmpxchg8b(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __dummy; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \ + : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\ + : "a" (__old1), "d"(__old2), \ + "b" (__new1), "c" (__new2) \ + : "memory"); \ + __ret; }) + + +#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __dummy; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile("cmpxchg8b %2; setz %1" \ + : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\ + : "a" (__old1), "d"(__old2), \ + "b" (__new1), "c" (__new2) \ + : "memory"); \ + __ret; }) + + +#define cmpxchg_double(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + VM_BUG_ON((unsigned long)(ptr) % 8); \ + cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \ +}) + +#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + VM_BUG_ON((unsigned long)(ptr) % 8); \ + cmpxchg8b_local((ptr), (o1), (o2), (n1), (n2)); \ +}) + #define system_has_cmpxchg_double() cpu_has_cx8 #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/trunk/arch/x86/include/asm/cmpxchg_64.h b/trunk/arch/x86/include/asm/cmpxchg_64.h index 614be87f1a9b..285da02c38fa 100644 --- a/trunk/arch/x86/include/asm/cmpxchg_64.h +++ b/trunk/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,49 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) cmpxchg_local((ptr), (o), (n)); \ }) +#define cmpxchg16b(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __junk; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + 
__typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \ + : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ + : "b"(__new1), "c"(__new2), \ + "a"(__old1), "d"(__old2)); \ + __ret; }) + + +#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __junk; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile("cmpxchg16b %2;setz %1" \ + : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ + : "b"(__new1), "c"(__new2), \ + "a"(__old1), "d"(__old2)); \ + __ret; }) + +#define cmpxchg_double(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + VM_BUG_ON((unsigned long)(ptr) % 16); \ + cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \ +}) + +#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + VM_BUG_ON((unsigned long)(ptr) % 16); \ + cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ +}) + #define system_has_cmpxchg_double() cpu_has_cx16 #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/trunk/arch/x86/include/asm/div64.h b/trunk/arch/x86/include/asm/div64.h index ced283ac79df..9a2d644c08ef 100644 --- a/trunk/arch/x86/include/asm/div64.h +++ b/trunk/arch/x86/include/asm/div64.h @@ -4,7 +4,6 @@ #ifdef CONFIG_X86_32 #include -#include /* * do_div() is NOT a C function. It wants to return @@ -22,20 +21,15 @@ ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ - if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \ - __mod = n & (__base - 1); \ - n >>= ilog2(__base); \ - } else { \ - asm("" : "=a" (__low), "=d" (__high) : "A" (n));\ - __upper = __high; \ - if (__high) { \ - __upper = __high % (__base); \ - __high = __high / (__base); \ - } \ - asm("divl %2" : "=a" (__low), "=d" (__mod) \ - : "rm" (__base), "0" (__low), "1" (__upper)); \ - asm("" : "=A" (n) : "a" (__low), "d" (__high)); \ + asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ } \ + asm("divl %2":"=a" (__low), "=d" (__mod) \ + : "rm" (__base), "0" (__low), "1" (__upper)); \ + asm("":"=A" (n) : "a" (__low), "d" (__high)); \ __mod; \ }) diff --git a/trunk/arch/x86/include/asm/e820.h b/trunk/arch/x86/include/asm/e820.h index 37782566af24..908b96957d88 100644 --- a/trunk/arch/x86/include/asm/e820.h +++ b/trunk/arch/x86/include/asm/e820.h @@ -117,7 +117,7 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); -extern u64 early_reserve_e820(u64 sizet, u64 align); +extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); void memblock_x86_fill(void); void memblock_find_dma_reserve(void); diff --git a/trunk/arch/x86/include/asm/hardirq.h b/trunk/arch/x86/include/asm/hardirq.h index da0b3ca815b7..55e4de613f0e 100644 --- a/trunk/arch/x86/include/asm/hardirq.h +++ b/trunk/arch/x86/include/asm/hardirq.h @@ -11,7 +11,6 @@ typedef struct { #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; - unsigned int icr_read_retry_count; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/trunk/arch/x86/include/asm/i387.h b/trunk/arch/x86/include/asm/i387.h index 6919e936345b..c9e09ea05644 100644 --- a/trunk/arch/x86/include/asm/i387.h +++ 
b/trunk/arch/x86/include/asm/i387.h @@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu) #ifdef CONFIG_SMP #define safe_address (__per_cpu_offset[0]) #else -#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) +#define safe_address (kstat_cpu(0).cpustat.user) #endif /* diff --git a/trunk/arch/x86/include/asm/insn.h b/trunk/arch/x86/include/asm/insn.h index 74df3f1eddfd..88c765e16410 100644 --- a/trunk/arch/x86/include/asm/insn.h +++ b/trunk/arch/x86/include/asm/insn.h @@ -137,13 +137,6 @@ static inline int insn_is_avx(struct insn *insn) return (insn->vex_prefix.value != 0); } -/* Ensure this instruction is decoded completely */ -static inline int insn_complete(struct insn *insn) -{ - return insn->opcode.got && insn->modrm.got && insn->sib.got && - insn->displacement.got && insn->immediate.got; -} - static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/trunk/arch/x86/include/asm/mach_timer.h b/trunk/arch/x86/include/asm/mach_timer.h index 88d0c3c74c13..853728519ae9 100644 --- a/trunk/arch/x86/include/asm/mach_timer.h +++ b/trunk/arch/x86/include/asm/mach_timer.h @@ -15,7 +15,7 @@ #define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ #define CALIBRATE_LATCH \ - ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { diff --git a/trunk/arch/x86/include/asm/mc146818rtc.h b/trunk/arch/x86/include/asm/mc146818rtc.h index 0e8e85bb7c51..01fdf5674e24 100644 --- a/trunk/arch/x86/include/asm/mc146818rtc.h +++ b/trunk/arch/x86/include/asm/mc146818rtc.h @@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void) #else #define lock_cmos_prefix(reg) do {} while (0) #define lock_cmos_suffix(reg) do {} while (0) -#define lock_cmos(reg) do { } while (0) -#define unlock_cmos() do { } while (0) +#define lock_cmos(reg) +#define unlock_cmos() #define do_i_have_lock_cmos() 0 #define current_lock_cmos_reg() 0 #endif diff --git a/trunk/arch/x86/include/asm/mce.h b/trunk/arch/x86/include/asm/mce.h index 6add827381c9..0e8ae57d3656 100644 --- a/trunk/arch/x86/include/asm/mce.h +++ b/trunk/arch/x86/include/asm/mce.h @@ -50,11 +50,10 @@ #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) #define MCJ_CTX_RANDOM 0 /* inject context: random */ -#define MCJ_CTX_PROCESS 0x1 /* inject context: process */ -#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */ -#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */ -#define MCJ_EXCEPTION 0x8 /* raise as exception */ -#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ /* Fields are zero when not available */ struct mce { @@ -121,8 +120,7 @@ struct mce_log { #ifdef __KERNEL__ -extern void mce_register_decode_chain(struct notifier_block *nb); -extern void mce_unregister_decode_chain(struct notifier_block *nb); +extern struct atomic_notifier_head x86_mce_decoder_chain; #include #include diff --git a/trunk/arch/x86/include/asm/memblock.h b/trunk/arch/x86/include/asm/memblock.h new file mode 100644 index 000000000000..0cd3800f33b9 --- /dev/null +++ b/trunk/arch/x86/include/asm/memblock.h @@ -0,0 +1,23 @@ +#ifndef _X86_MEMBLOCK_H +#define _X86_MEMBLOCK_H + +#define ARCH_DISCARD_MEMBLOCK + +u64 memblock_x86_find_in_range_size(u64 start, u64 
*sizep, u64 align); + +void memblock_x86_reserve_range(u64 start, u64 end, char *name); +void memblock_x86_free_range(u64 start, u64 end); +struct range; +int __get_free_all_memory_range(struct range **range, int nodeid, + unsigned long start_pfn, unsigned long end_pfn); +int get_free_all_memory_range(struct range **rangep, int nodeid); + +void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, + unsigned long last_pfn); +u64 memblock_x86_hole_size(u64 start, u64 end); +u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); +u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); +u64 memblock_x86_memory_in_range(u64 addr, u64 limit); +bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align); + +#endif diff --git a/trunk/arch/x86/include/asm/microcode.h b/trunk/arch/x86/include/asm/microcode.h index 4ebe157bf73d..24215072d0e1 100644 --- a/trunk/arch/x86/include/asm/microcode.h +++ b/trunk/arch/x86/include/asm/microcode.h @@ -48,7 +48,6 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); -extern void __exit exit_amd_microcode(void); static inline void get_ucode_data(void *to, const u8 *from, size_t n) { @@ -60,7 +59,6 @@ static inline struct microcode_ops * __init init_amd_microcode(void) { return NULL; } -static inline void __exit exit_amd_microcode(void) {} #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/trunk/arch/x86/include/asm/numachip/numachip_csr.h b/trunk/arch/x86/include/asm/numachip/numachip_csr.h deleted file mode 100644 index 660f843df928..000000000000 --- a/trunk/arch/x86/include/asm/numachip/numachip_csr.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Numascale NumaConnect-Specific Header file - * - * Copyright (C) 2011 Numascale AS. All rights reserved. 
- * - * Send feedback to - * - */ - -#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H -#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H - -#include -#include -#include -#include -#include -#include - -#define CSR_NODE_SHIFT 16 -#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) -#define CSR_NODE_MASK 0x0fff /* 4K nodes */ - -/* 32K CSR space, b15 indicates geo/non-geo */ -#define CSR_OFFSET_MASK 0x7fffUL - -/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ -#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL -#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL -#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) - -/* - * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however - * when using the direct mapping on x86_64, both start and size needs to be - * aligned with PMD_SIZE which is 2M - */ -#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL -#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL -#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) - -static inline void *gcsr_address(int node, unsigned long offset) -{ - return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | - CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); -} - -static inline void *lcsr_address(unsigned long offset) -{ - return __va(NUMACHIP_LCSR_BASE | (1UL << 15) | - CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); -} - -static inline unsigned int read_gcsr(int node, unsigned long offset) -{ - return swab32(readl(gcsr_address(node, offset))); -} - -static inline void write_gcsr(int node, unsigned long offset, unsigned int val) -{ - writel(swab32(val), gcsr_address(node, offset)); -} - -static inline unsigned int read_lcsr(unsigned long offset) -{ - return swab32(readl(lcsr_address(offset))); -} - -static inline void write_lcsr(unsigned long offset, unsigned int val) -{ - writel(swab32(val), lcsr_address(offset)); -} - -/* ========================================================================= */ -/* CSR_G0_STATE_CLEAR */ -/* ========================================================================= */ - -#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) -union numachip_csr_g0_state_clear { - unsigned int v; - struct numachip_csr_g0_state_clear_s { - unsigned int _state:2; - unsigned int _rsvd_2_6:5; - unsigned int _lost:1; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G0_NODE_IDS */ -/* ========================================================================= */ - -#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) -union numachip_csr_g0_node_ids { - unsigned int v; - struct numachip_csr_g0_node_ids_s { - unsigned int _initialid:16; - unsigned int _nodeid:12; - unsigned int _rsvd_28_31:4; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_GEN */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) -union numachip_csr_g3_ext_irq_gen { - unsigned int v; - struct numachip_csr_g3_ext_irq_gen_s { - unsigned int _vector:8; - unsigned int _msgtype:3; - unsigned int _index:5; - unsigned int _destination_apic_id:16; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_STATUS */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) -union numachip_csr_g3_ext_irq_status { - unsigned int v; - struct 
numachip_csr_g3_ext_irq_status_s { - unsigned int _result:32; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_DEST */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) -union numachip_csr_g3_ext_irq_dest { - unsigned int v; - struct numachip_csr_g3_ext_irq_dest_s { - unsigned int _irq:8; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) -union numachip_csr_g3_nc_att_map_select { - unsigned int v; - struct numachip_csr_g3_nc_att_map_select_s { - unsigned int _upper_address_bits:4; - unsigned int _select_ram:4; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) - -#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ - diff --git a/trunk/arch/x86/include/asm/percpu.h b/trunk/arch/x86/include/asm/percpu.h index 529bf07e8067..3470c9d0ebba 100644 --- a/trunk/arch/x86/include/asm/percpu.h +++ b/trunk/arch/x86/include/asm/percpu.h @@ -451,20 +451,23 @@ do { \ #endif /* !CONFIG_M386 */ #ifdef CONFIG_X86_CMPXCHG64 -#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ +#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ ({ \ - bool __ret; \ - typeof(pcp1) __o1 = (o1), __n1 = (n1); \ - typeof(pcp2) __o2 = (o2), __n2 = (n2); \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy = n2; \ asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ - : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \ - : "b" (__n1), "c" (__n2), "a" (__o1)); \ + : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ + : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#define irqsafe_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) #endif /* CONFIG_X86_CMPXCHG64 */ /* @@ -505,23 +508,31 @@ do { \ * it in software. The address used in the cmpxchg16 instruction must be * aligned to a 16 byte boundary. 
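For reference, a minimal user-space sketch of the double-width compare-and-exchange semantics that the percpu_cmpxchg16b_double() macro below implements with cmpxchg16b: both 8-byte halves must match their expected values, or nothing is written. The struct layout, function name, and use of the GCC/Clang __atomic builtins are illustrative assumptions, not kernel code; building it on x86-64 needs -mcx16 (or libatomic).

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pair {
	uint64_t a;
	uint64_t b;
} __attribute__((aligned(16)));	/* cmpxchg16b requires 16-byte alignment */

static bool cmpxchg_double(struct pair *p, struct pair expected, struct pair desired)
{
	/* Compares all 16 bytes atomically; compiles to lock cmpxchg16b with -mcx16. */
	return __atomic_compare_exchange(p, &expected, &desired,
					 false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	struct pair p = { 1, 2 };

	/* Succeeds only because both halves match the expected values. */
	bool ok = cmpxchg_double(&p, (struct pair){ 1, 2 }, (struct pair){ 3, 4 });
	printf("ok=%d a=%llu b=%llu\n", ok,
	       (unsigned long long)p.a, (unsigned long long)p.b);
	return 0;
}
```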
*/ -#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \ +#ifdef CONFIG_SMP +#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3 +#else +#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2 +#endif +#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ ({ \ - bool __ret; \ - typeof(pcp1) __o1 = (o1), __n1 = (n1); \ - typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \ - "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy; \ + alternative_io(CMPXCHG16B_EMU_CALL, \ + "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ - ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \ - "+m" (pcp2), "+d" (__o2)), \ - "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \ + ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ + "S" (&pcp1), "b"(__n1), "c"(__n2), \ + "a"(__o1), "d"(__o2) : "memory"); \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double -#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double -#define irqsafe_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) #endif diff --git a/trunk/arch/x86/include/asm/perf_event.h b/trunk/arch/x86/include/asm/perf_event.h index 096c975e099f..f61c62f7d5d8 100644 --- a/trunk/arch/x86/include/asm/perf_event.h +++ b/trunk/arch/x86/include/asm/perf_event.h @@ -57,7 +57,6 @@ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 -#define ARCH_PERFMON_EVENTS_COUNT 7 /* * Intel "Architectural Performance Monitoring" CPUID @@ -73,19 +72,6 @@ union cpuid10_eax { unsigned int full; }; -union cpuid10_ebx { - struct { - unsigned int no_unhalted_core_cycles:1; - unsigned int no_instructions_retired:1; - unsigned int no_unhalted_reference_cycles:1; - unsigned int no_llc_reference:1; - unsigned int no_llc_misses:1; - unsigned int no_branch_instruction_retired:1; - unsigned int no_branch_misses_retired:1; - } split; - unsigned int full; -}; - union cpuid10_edx { struct { unsigned int num_counters_fixed:5; @@ -95,15 +81,6 @@ union cpuid10_edx { unsigned int full; }; -struct x86_pmu_capability { - int version; - int num_counters_gp; - int num_counters_fixed; - int bit_width_gp; - int bit_width_fixed; - unsigned int events_mask; - int events_mask_len; -}; /* * Fixed-purpose performance events: @@ -112,24 +89,23 @@ struct x86_pmu_capability { /* * All 3 fixed-mode PMCs are configured via this single MSR: */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d /* * The counts are available in three separate MSRs: */ /* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) /* 
CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) -#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) /* * We model BTS tracing as another fixed-mode PMC. @@ -226,7 +202,6 @@ struct perf_guest_switch_msr { }; extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); -extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -234,11 +209,6 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) return NULL; } -static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) -{ - memset(cap, 0, sizeof(*cap)); -} - static inline void perf_events_lapic_init(void) { } #endif diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index 49afb3f41eb6..18601c86fab1 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep); } -#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) +#define flush_tlb_fix_spurious_fault(vma, address) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) diff --git a/trunk/arch/x86/include/asm/processor-flags.h b/trunk/arch/x86/include/asm/processor-flags.h index f8ab3eaad128..2dddb317bb39 100644 --- a/trunk/arch/x86/include/asm/processor-flags.h +++ b/trunk/arch/x86/include/asm/processor-flags.h @@ -6,7 +6,6 @@ * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ diff --git a/trunk/arch/x86/include/asm/processor.h b/trunk/arch/x86/include/asm/processor.h index aa9088c26931..b650435ffb53 100644 --- a/trunk/arch/x86/include/asm/processor.h +++ b/trunk/arch/x86/include/asm/processor.h @@ -99,6 +99,7 @@ struct cpuinfo_x86 { u16 apicid; u16 initial_apicid; u16 x86_clflush_size; +#ifdef CONFIG_SMP /* number of cores as seen by the OS: */ u16 booted_cores; /* Physical processor id: */ @@ -109,6 +110,7 @@ struct cpuinfo_x86 { u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; +#endif u32 microcode; } __attribute__((__aligned__(SMP_CACHE_BYTES))); diff --git a/trunk/arch/x86/include/asm/spinlock.h b/trunk/arch/x86/include/asm/spinlock.h index a82c2bf504b6..972c260919a3 100644 --- a/trunk/arch/x86/include/asm/spinlock.h +++ b/trunk/arch/x86/include/asm/spinlock.h @@ -79,10 +79,23 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } +#if (NR_CPUS < 256) static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { - __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); + asm volatile(UNLOCK_LOCK_PREFIX "incb %0" + : "+m" (lock->head_tail) + : + : "memory", "cc"); } +#else +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +{ + asm volatile(UNLOCK_LOCK_PREFIX "incw %0" + : "+m" (lock->head_tail) + : + : "memory", "cc"); +} +#endif static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { diff --git a/trunk/arch/x86/include/asm/thread_info.h 
b/trunk/arch/x86/include/asm/thread_info.h index 185b719ec61a..a1fe5c127b52 100644 --- a/trunk/arch/x86/include/asm/thread_info.h +++ b/trunk/arch/x86/include/asm/thread_info.h @@ -40,8 +40,7 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int sig_on_uaccess_error:1; - int uaccess_err:1; /* uaccess failed */ + int uaccess_err; }; #define INIT_THREAD_INFO(tsk) \ @@ -232,12 +231,6 @@ static inline struct thread_info *current_thread_info(void) movq PER_CPU_VAR(kernel_stack),reg ; \ subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg -/* - * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in - * a certain register (to be used in assembler memory operands). - */ -#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) - #endif #endif /* !X86_32 */ diff --git a/trunk/arch/x86/include/asm/topology.h b/trunk/arch/x86/include/asm/topology.h index 800f77c60051..c00692476e9f 100644 --- a/trunk/arch/x86/include/asm/topology.h +++ b/trunk/arch/x86/include/asm/topology.h @@ -130,8 +130,10 @@ extern void setup_node_to_cpumask_map(void); .balance_interval = 1, \ } +#ifdef CONFIG_X86_64 extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) +#endif #else /* !CONFIG_NUMA */ diff --git a/trunk/arch/x86/include/asm/tsc.h b/trunk/arch/x86/include/asm/tsc.h index 15d99153a96d..83e2efd181e2 100644 --- a/trunk/arch/x86/include/asm/tsc.h +++ b/trunk/arch/x86/include/asm/tsc.h @@ -51,8 +51,6 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); -extern int tsc_clocksource_reliable; - /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/trunk/arch/x86/include/asm/uaccess.h b/trunk/arch/x86/include/asm/uaccess.h index 8be5f54d9360..36361bf6fdd1 100644 --- a/trunk/arch/x86/include/asm/uaccess.h +++ b/trunk/arch/x86/include/asm/uaccess.h @@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; }; barrier(); #define uaccess_catch(err) \ - (err) |= (current_thread_info()->uaccess_err ? 
-EFAULT : 0); \ + (err) |= current_thread_info()->uaccess_err; \ current_thread_info()->uaccess_err = prev_err; \ } while (0) diff --git a/trunk/arch/x86/include/asm/x86_init.h b/trunk/arch/x86/include/asm/x86_init.h index 1ac860a09849..1971e652d24b 100644 --- a/trunk/arch/x86/include/asm/x86_init.h +++ b/trunk/arch/x86/include/asm/x86_init.h @@ -7,7 +7,6 @@ struct mpc_bus; struct mpc_cpu; struct mpc_table; -struct cpuinfo_x86; /** * struct x86_init_mpparse - platform specific mpparse ops @@ -148,7 +147,6 @@ struct x86_init_ops { */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); - void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; /** @@ -188,6 +186,5 @@ extern struct x86_msi_ops x86_msi; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); -extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node); #endif diff --git a/trunk/arch/x86/kernel/acpi/boot.c b/trunk/arch/x86/kernel/acpi/boot.c index ce664f33ea8e..4558f0d0822d 100644 --- a/trunk/arch/x86/kernel/acpi/boot.c +++ b/trunk/arch/x86/kernel/acpi/boot.c @@ -219,8 +219,6 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; - int apic_id; - u8 enabled; processor = (struct acpi_madt_local_x2apic *)header; @@ -229,8 +227,6 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); - apic_id = processor->local_apic_id; - enabled = processor->lapic_flags & ACPI_MADT_ENABLED; #ifdef CONFIG_X86_X2APIC /* * We need to register disabled CPU as well to permit @@ -239,10 +235,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. 
*/ - if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) - printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); - else - acpi_register_lapic(apic_id, enabled); + acpi_register_lapic(processor->local_apic_id, /* APIC ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif diff --git a/trunk/arch/x86/kernel/amd_nb.c b/trunk/arch/x86/kernel/amd_nb.c index 013c1810ce72..4c39baa8facc 100644 --- a/trunk/arch/x86/kernel/amd_nb.c +++ b/trunk/arch/x86/kernel/amd_nb.c @@ -123,14 +123,16 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid; + int cuid = 0; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); +#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; +#endif return (mask >> (4 * cuid)) & 0xf; } @@ -139,7 +141,7 @@ int amd_set_subcaches(int cpu, int mask) static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); unsigned int reg; - int cuid; + int cuid = 0; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) return -EINVAL; @@ -157,7 +159,9 @@ int amd_set_subcaches(int cpu, int mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } +#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; +#endif mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/trunk/arch/x86/kernel/aperture_64.c b/trunk/arch/x86/kernel/aperture_64.c index 6e76c191a835..3d2661ca6542 100644 --- a/trunk/arch/x86/kernel/aperture_64.c +++ b/trunk/arch/x86/kernel/aperture_64.c @@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void) */ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); - if (!addr || addr + aper_size > GART_MAX_ADDR) { + if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); return 0; } - memblock_reserve(addr, aper_size); + memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. diff --git a/trunk/arch/x86/kernel/apic/Makefile b/trunk/arch/x86/kernel/apic/Makefile index 0ae0323b1f9c..767fd04f2843 100644 --- a/trunk/arch/x86/kernel/apic/Makefile +++ b/trunk/arch/x86/kernel/apic/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here -obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o diff --git a/trunk/arch/x86/kernel/apic/apic.c b/trunk/arch/x86/kernel/apic/apic.c index 2eec05b6d1b8..f98d84caf94c 100644 --- a/trunk/arch/x86/kernel/apic/apic.c +++ b/trunk/arch/x86/kernel/apic/apic.c @@ -146,26 +146,16 @@ __setup("apicpmtimer", setup_apicpmtimer); int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ -int x2apic_preenabled; -static int x2apic_disabled; -static int nox2apic; +static int x2apic_preenabled; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { - int apicid = native_apic_msr_read(APIC_ID); - - if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); - return 0; - } - - pr_warning("x2apic already enabled. 
will disable it\n"); - } else - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - - nox2apic = 1; + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } early_param("nox2apic", setup_nox2apic); @@ -260,7 +250,6 @@ u32 native_safe_apic_wait_icr_idle(void) send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; if (!send_status) break; - inc_irq_stat(icr_read_retry_count); udelay(100); } while (timeout++ < 1000); @@ -887,8 +876,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ - irq_enter(); exit_idle(); + irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -1442,45 +1431,6 @@ void __init bsp_end_local_APIC_setup(void) } #ifdef CONFIG_X86_X2APIC -/* - * Need to disable xapic and x2apic at the same time and then enable xapic mode - */ -static inline void __disable_x2apic(u64 msr) -{ - wrmsrl(MSR_IA32_APICBASE, - msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); - wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); -} - -static __init void disable_x2apic(void) -{ - u64 msr; - - if (!cpu_has_x2apic) - return; - - rdmsrl(MSR_IA32_APICBASE, msr); - if (msr & X2APIC_ENABLE) { - u32 x2apic_id = read_apic_id(); - - if (x2apic_id >= 255) - panic("Cannot disable x2apic, id: %08x\n", x2apic_id); - - pr_info("Disabling x2apic\n"); - __disable_x2apic(msr); - - if (nox2apic) { - clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - } - - x2apic_disabled = 1; - x2apic_mode = 0; - - register_lapic_address(mp_lapic_addr); - } -} - void check_x2apic(void) { if (x2apic_enabled()) { @@ -1491,20 +1441,15 @@ void check_x2apic(void) void enable_x2apic(void) { - u64 msr; - - rdmsrl(MSR_IA32_APICBASE, msr); - if (x2apic_disabled) { - __disable_x2apic(msr); - return; - } + int msr, msr2; if (!x2apic_mode) return; + rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { printk_once(KERN_INFO "Enabling x2apic\n"); - wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); } } #endif /* CONFIG_X86_X2APIC */ @@ -1541,34 +1486,25 @@ void __init enable_IR_x2apic(void) ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - return; + goto out; } local_irq_save(flags); legacy_pic->mask_all(); mask_ioapic_entries(); - if (x2apic_preenabled && nox2apic) - disable_x2apic(); - if (dmar_table_init_ret) ret = -1; else ret = enable_IR(); - if (!x2apic_supported()) - goto skip_x2apic; - if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) { - if (x2apic_preenabled) - disable_x2apic(); - goto skip_x2apic; - } + !hypervisor_x2apic_available()) + goto nox2apic; /* * without IR all CPUs can be addressed by IOAPIC/MSI * only in physical mode @@ -1576,10 +1512,8 @@ void __init enable_IR_x2apic(void) x2apic_force_phys(); } - if (ret == IRQ_REMAP_XAPIC_MODE) { - pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); - goto skip_x2apic; - } + if (ret == IRQ_REMAP_XAPIC_MODE) + goto nox2apic; x2apic_enabled = 1; @@ -1589,11 +1523,22 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -skip_x2apic: +nox2apic: if (ret < 0) /* IR enabling failed */ restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); + +out: + if (x2apic_enabled || !x2apic_supported()) + 
return; + + if (x2apic_preenabled) + panic("x2apic: enabled by BIOS but kernel init failed."); + else if (ret == IRQ_REMAP_XAPIC_MODE) + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); + else if (ret < 0) + pr_info("x2apic not enabled, IRQ remapping init failed\n"); } #ifdef CONFIG_X86_64 @@ -1864,8 +1809,8 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; - irq_enter(); exit_idle(); + irq_enter(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1901,8 +1846,8 @@ void smp_error_interrupt(struct pt_regs *regs) "Illegal register address", /* APIC Error Bit 7 */ }; - irq_enter(); exit_idle(); + irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); diff --git a/trunk/arch/x86/kernel/apic/apic_flat_64.c b/trunk/arch/x86/kernel/apic/apic_flat_64.c index 8c3cdded6f2b..f7a41e4cae47 100644 --- a/trunk/arch/x86/kernel/apic/apic_flat_64.c +++ b/trunk/arch/x86/kernel/apic/apic_flat_64.c @@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ -void flat_init_apic_ldr(void) +static void flat_init_apic_ldr(void) { unsigned long val; unsigned long num, id; @@ -171,14 +171,9 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static int flat_probe(void) -{ - return 1; -} - static struct apic apic_flat = { .name = "flat", - .probe = flat_probe, + .probe = NULL, .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, diff --git a/trunk/arch/x86/kernel/apic/apic_numachip.c b/trunk/arch/x86/kernel/apic/apic_numachip.c deleted file mode 100644 index 09d3d8c1cd99..000000000000 --- a/trunk/arch/x86/kernel/apic/apic_numachip.c +++ /dev/null @@ -1,294 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Numascale NumaConnect-Specific APIC Code - * - * Copyright (C) 2011 Numascale AS. All rights reserved. 
- * - * Send feedback to - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -static int numachip_system __read_mostly; - -static struct apic apic_numachip __read_mostly; - -static unsigned int get_apic_id(unsigned long x) -{ - unsigned long value; - unsigned int id; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); - - return id; -} - -static unsigned long set_apic_id(unsigned int id) -{ - unsigned long x; - - x = ((id & 0xffU) << 24); - return x; -} - -static unsigned int read_xapic_id(void) -{ - return get_apic_id(apic_read(APIC_ID)); -} - -static int numachip_apic_id_registered(void) -{ - return physid_isset(read_xapic_id(), phys_cpu_present_map); -} - -static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - -static const struct cpumask *numachip_target_cpus(void) -{ - return cpu_online_mask; -} - -static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - -static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) -{ - union numachip_csr_g3_ext_irq_gen int_gen; - - int_gen.s._destination_apic_id = phys_apicid; - int_gen.s._vector = 0; - int_gen.s._msgtype = APIC_DM_INIT >> 8; - int_gen.s._index = 0; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); - - int_gen.s._msgtype = APIC_DM_STARTUP >> 8; - int_gen.s._vector = start_rip >> 12; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); - - atomic_set(&init_deasserted, 1); - return 0; -} - -static void numachip_send_IPI_one(int cpu, int vector) -{ - union numachip_csr_g3_ext_irq_gen int_gen; - int apicid = per_cpu(x86_cpu_to_apicid, cpu); - - int_gen.s._destination_apic_id = apicid; - int_gen.s._vector = vector; - int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; - int_gen.s._index = 0; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); -} - -static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) -{ - unsigned int cpu; - - for_each_cpu(cpu, mask) - numachip_send_IPI_one(cpu, vector); -} - -static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, - int vector) -{ - unsigned int this_cpu = smp_processor_id(); - unsigned int cpu; - - for_each_cpu(cpu, mask) { - if (cpu != this_cpu) - numachip_send_IPI_one(cpu, vector); - } -} - -static void numachip_send_IPI_allbutself(int vector) -{ - unsigned int this_cpu = smp_processor_id(); - unsigned int cpu; - - for_each_online_cpu(cpu) { - if (cpu != this_cpu) - numachip_send_IPI_one(cpu, vector); - } -} - -static void numachip_send_IPI_all(int vector) -{ - numachip_send_IPI_mask(cpu_online_mask, vector); -} - -static void numachip_send_IPI_self(int vector) -{ - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); -} - -static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; -} - -static unsigned int -numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - -static int __init numachip_probe(void) -{ - return apic == &apic_numachip; -} - -static void __init map_csrs(void) -{ - printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", - NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - - printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", - NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); -} - -static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) -{ - c->phys_proc_id = node; - per_cpu(cpu_llc_id, smp_processor_id()) = node; -} - -static int __init numachip_system_init(void) -{ - unsigned int val; - - if (!numachip_system) - return 0; - - x86_cpuinit.fixup_cpu_id = fixup_cpu_id; - - map_csrs(); - - val = read_lcsr(CSR_G0_NODE_IDS); - printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); - - return 0; -} -early_initcall(numachip_system_init); - -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - if (!strncmp(oem_id, "NUMASC", 6)) { - numachip_system = 1; - return 1; - } - - return 0; -} - -static struct apic apic_numachip __refconst = { - - .name = "NumaConnect system", - .probe = numachip_probe, - .acpi_madt_oem_check = numachip_acpi_madt_oem_check, - .apic_id_registered = numachip_apic_id_registered, - - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ - - .target_cpus = numachip_target_cpus, - .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, - .check_apicid_present = NULL, - - .vector_allocation_domain = numachip_vector_allocation_domain, - .init_apic_ldr = flat_init_apic_ldr, - - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, - .multi_timer_check = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, - .phys_pkg_id = numachip_phys_pkg_id, - .mps_oem_check = NULL, - - .get_apic_id = get_apic_id, - .set_apic_id = set_apic_id, - .apic_id_mask = 0xffU << 24, - - .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, - - .send_IPI_mask = numachip_send_IPI_mask, - .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, - .send_IPI_allbutself = numachip_send_IPI_allbutself, - .send_IPI_all = numachip_send_IPI_all, - .send_IPI_self = numachip_send_IPI_self, - - .wakeup_secondary_cpu = numachip_wakeup_secondary, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = NULL, /* REMRD not supported */ - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, -}; -apic_driver(apic_numachip); - diff --git a/trunk/arch/x86/kernel/apic/io_apic.c b/trunk/arch/x86/kernel/apic/io_apic.c index fb072754bc1d..6d939d7847e2 100644 --- a/trunk/arch/x86/kernel/apic/io_apic.c +++ b/trunk/arch/x86/kernel/apic/io_apic.c @@ -2421,8 +2421,8 @@ asmlinkage void 
smp_irq_move_cleanup_interrupt(void) unsigned vector, me; ack_APIC_irq(); - irq_enter(); exit_idle(); + irq_enter(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -2948,10 +2948,6 @@ static inline void __init check_timer(void) } local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - if (x2apic_preenabled) - apic_printk(APIC_QUIET, KERN_INFO - "Perhaps problem with the pre-enabled x2apic mode\n" - "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: diff --git a/trunk/arch/x86/kernel/check.c b/trunk/arch/x86/kernel/check.c index 5da1269e8ddc..452932d34730 100644 --- a/trunk/arch/x86/kernel/check.c +++ b/trunk/arch/x86/kernel/check.c @@ -62,8 +62,7 @@ early_param("memory_corruption_check_size", set_corruption_check_size); void __init setup_bios_corruption_check(void) { - phys_addr_t start, end; - u64 i; + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ if (memory_corruption_check == -1) { memory_corruption_check = @@ -83,23 +82,28 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), - PAGE_SIZE, corruption_check_size); - end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), - PAGE_SIZE, corruption_check_size); - if (start >= end) - continue; + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); - memblock_reserve(start, end - start); - scan_areas[num_scan_areas].addr = start; - scan_areas[num_scan_areas].size = end - start; + if (addr == MEMBLOCK_ERROR) + break; + + if (addr >= corruption_check_size) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; /* Assume we've already mapped this early memory */ - memset(__va(start), 0, end - start); + memset(__va(addr), 0, size); - if (++num_scan_areas >= MAX_SCAN_AREAS) - break; + addr += size; } if (num_scan_areas) diff --git a/trunk/arch/x86/kernel/cpu/amd.c b/trunk/arch/x86/kernel/cpu/amd.c index f4773f4aae35..0bab2b18bb20 100644 --- a/trunk/arch/x86/kernel/cpu/amd.c +++ b/trunk/arch/x86/kernel/cpu/amd.c @@ -148,6 +148,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { +#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? 
*/ if (!c->cpu_index) return; @@ -191,6 +192,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; +#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) @@ -351,13 +353,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) if (node == NUMA_NO_NODE) node = per_cpu(cpu_llc_id, cpu); - /* - * If core numbers are inconsistent, it's likely a multi-fabric platform, - * so invoke platform-specific handler - */ - if (c->phys_proc_id != node) - x86_cpuinit.fixup_cpu_id(c, node); - if (!node_online(node)) { /* * Two possibilities here: diff --git a/trunk/arch/x86/kernel/cpu/centaur.c b/trunk/arch/x86/kernel/cpu/centaur.c index 159103c0b1f4..e58d978e0758 100644 --- a/trunk/arch/x86/kernel/cpu/centaur.c +++ b/trunk/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ - if (c->x86_model >= 6 && c->x86_model <= 13) { + if (c->x86_model >= 6 && c->x86_model <= 9) { rdmsr(MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); wrmsr(MSR_VIA_FCR, lo, hi); diff --git a/trunk/arch/x86/kernel/cpu/common.c b/trunk/arch/x86/kernel/cpu/common.c index 850f2963a420..aa003b13a831 100644 --- a/trunk/arch/x86/kernel/cpu/common.c +++ b/trunk/arch/x86/kernel/cpu/common.c @@ -676,7 +676,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); +#ifdef CONFIG_SMP c->cpu_index = 0; +#endif filter_cpuid_features(c, false); setup_smep(c); @@ -762,7 +764,10 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif + +#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; +#endif } setup_smep(c); @@ -1135,15 +1140,6 @@ static void dbg_restore_debug_regs(void) #define dbg_restore_debug_regs() #endif /* ! CONFIG_KGDB */ -/* - * Prints an error where the NUMA and configured core-number mismatch and the - * platform didn't override this to fix it up - */ -void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) -{ - pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); -} - /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT diff --git a/trunk/arch/x86/kernel/cpu/cpu.h b/trunk/arch/x86/kernel/cpu/cpu.h index 8bacc7826fb3..1b22dcc51af4 100644 --- a/trunk/arch/x86/kernel/cpu/cpu.h +++ b/trunk/arch/x86/kernel/cpu/cpu.h @@ -1,4 +1,5 @@ #ifndef ARCH_X86_CPU_H + #define ARCH_X86_CPU_H struct cpu_model_info { @@ -34,4 +35,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -#endif /* ARCH_X86_CPU_H */ +extern void get_cpu_cap(struct cpuinfo_x86 *c); + +#endif diff --git a/trunk/arch/x86/kernel/cpu/intel.c b/trunk/arch/x86/kernel/cpu/intel.c index 3e6ff6cbf42a..523131213f08 100644 --- a/trunk/arch/x86/kernel/cpu/intel.c +++ b/trunk/arch/x86/kernel/cpu/intel.c @@ -181,6 +181,7 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { +#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? 
*/ if (!c->cpu_index) return; @@ -197,6 +198,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } +#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c b/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb393577..319882ef848d 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -93,18 +92,6 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } -static void mce_irq_ipi(void *info) -{ - int cpu = smp_processor_id(); - struct mce *m = &__get_cpu_var(injectm); - - if (cpumask_test_cpu(cpu, mce_inject_cpumask) && - m->inject_flags & MCJ_EXCEPTION) { - cpumask_clear_cpu(cpu, mce_inject_cpumask); - raise_exception(m, NULL); - } -} - /* Inject mce on current CPU */ static int raise_local(void) { @@ -152,10 +139,9 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { + if (m->inject_flags & MCJ_NMI_BROADCAST) { unsigned long start; int cpu; - get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -165,25 +151,13 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) { - if (m->inject_flags & MCJ_IRQ_BRAODCAST) { - /* - * don't wait because mce_irq_ipi is necessary - * to be sync with following raise_local - */ - preempt_disable(); - smp_call_function_many(mce_inject_cpumask, - mce_irq_ipi, NULL, 0); - preempt_enable(); - } else if (m->inject_flags & MCJ_NMI_BROADCAST) - apic->send_IPI_mask(mce_inject_cpumask, - NMI_VECTOR); - } + if (!cpumask_empty(mce_inject_cpumask)) + apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject %lx\n", + "Timeout waiting for mce inject NMI %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce.c b/trunk/arch/x86/kernel/cpu/mcheck/mce.c index cbe82b5918ce..2af127d4c3d1 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce.c @@ -95,6 +95,13 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -102,12 +109,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. 
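Because this revert exports x86_mce_decoder_chain directly and drops the mce_register_decode_chain()/mce_unregister_decode_chain() wrappers, a decoder such as an EDAC driver hooks the chain with the raw atomic-notifier API. A hedged kernel-style sketch — the callback name and printout are hypothetical, only the chain symbol and notifier calls come from this patch:

```c
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <asm/mce.h>

static int my_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = data;

	pr_info("decoded MCE: bank %d status 0x%llx\n",
		m->bank, (unsigned long long)m->status);
	return NOTIFY_OK;	/* let other decoders on the chain run too */
}

static struct notifier_block my_mce_dec = {
	.notifier_call = my_decode_mce,
};

static int __init my_decoder_init(void)
{
	/* Register directly on the exported chain, as this revert expects. */
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec);
	return 0;
}
```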
- */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); - /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -118,7 +119,9 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -187,57 +190,6 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -static void drain_mcelog_buffer(void) -{ - unsigned int next, i, prev = 0; - - next = rcu_dereference_check_mce(mcelog.next); - - do { - struct mce *m; - - /* drain what was logged during boot */ - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - unsigned retries = 1; - - m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2*retries)) - retries++; - - cpu_relax(); - - if (!m->finished && retries >= 4) { - pr_err("MCE: skipping error being logged currently!\n"); - break; - } - } - smp_rmb(); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - } - - memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); -} - - -void mce_register_decode_chain(struct notifier_block *nb) -{ - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); - drain_mcelog_buffer(); -} -EXPORT_SYMBOL_GPL(mce_register_decode_chain); - -void mce_unregister_decode_chain(struct notifier_block *nb) -{ - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); -} -EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); - static void print_mce(struct mce *m) { int ret = 0; diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_amd.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_amd.c index 1d76872b6a45..f5474218cffe 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -64,9 +64,11 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); +#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; +#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -200,9 +202,10 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); +#ifdef CONFIG_SMP if (shared_bank[bank] && c->cpu_core_id) break; - +#endif offset = setup_APIC_mce(offset, (high & MASK_LVTOFF_HI) >> 20); @@ -528,6 +531,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); +#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -554,6 +558,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } +#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { diff --git a/trunk/arch/x86/kernel/cpu/mcheck/therm_throt.c b/trunk/arch/x86/kernel/cpu/mcheck/therm_throt.c index 39c6089891e4..787e06c84ea6 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -323,6 +323,17 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. 
+ */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + static void notify_thresholds(__u64 msr_val) { /* check whether the interrupt handler is defined; @@ -352,23 +363,27 @@ static void intel_thermal_interrupt(void) if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); if (this_cpu_has(X86_FEATURE_PLN)) - therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL); + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL); + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); if (this_cpu_has(X86_FEATURE_PLN)) - therm_throt_process(msr_val & + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL); + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); } } @@ -382,8 +397,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { - irq_enter(); exit_idle(); + irq_enter(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); irq_exit(); diff --git a/trunk/arch/x86/kernel/cpu/mcheck/threshold.c b/trunk/arch/x86/kernel/cpu/mcheck/threshold.c index aa578cadb940..d746df2909c9 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void smp_threshold_interrupt(void) { - irq_enter(); exit_idle(); + irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); diff --git a/trunk/arch/x86/kernel/cpu/perf_event.c b/trunk/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..2bda212a0010 100644 --- a/trunk/arch/x86/kernel/cpu/perf_event.c +++ b/trunk/arch/x86/kernel/cpu/perf_event.c @@ -484,195 +484,18 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } -/* - * Event scheduler state: - * - * Assign events iterating over all events and counters, beginning - * with events with least weights first. Keep the current iterator - * state in struct sched_state. - */ -struct sched_state { - int weight; - int event; /* event index */ - int counter; /* counter index */ - int unassigned; /* number of events to be assigned left */ - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -}; - -/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ -#define SCHED_STATES_MAX 2 - -struct perf_sched { - int max_weight; - int max_events; - struct event_constraint **constraints; - struct sched_state state; - int saved_states; - struct sched_state saved[SCHED_STATES_MAX]; -}; - -/* - * Initialize interator that runs through all events and counters. 
- */ -static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, - int num, int wmin, int wmax) -{ - int idx; - - memset(sched, 0, sizeof(*sched)); - sched->max_events = num; - sched->max_weight = wmax; - sched->constraints = c; - - for (idx = 0; idx < num; idx++) { - if (c[idx]->weight == wmin) - break; - } - - sched->state.event = idx; /* start with min weight */ - sched->state.weight = wmin; - sched->state.unassigned = num; -} - -static void perf_sched_save_state(struct perf_sched *sched) -{ - if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) - return; - - sched->saved[sched->saved_states] = sched->state; - sched->saved_states++; -} - -static bool perf_sched_restore_state(struct perf_sched *sched) -{ - if (!sched->saved_states) - return false; - - sched->saved_states--; - sched->state = sched->saved[sched->saved_states]; - - /* continue with next counter: */ - clear_bit(sched->state.counter++, sched->state.used); - - return true; -} - -/* - * Select a counter for the current event to schedule. Return true on - * success. - */ -static bool __perf_sched_find_counter(struct perf_sched *sched) -{ - struct event_constraint *c; - int idx; - - if (!sched->state.unassigned) - return false; - - if (sched->state.event >= sched->max_events) - return false; - - c = sched->constraints[sched->state.event]; - - /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { - idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { - if (!__test_and_set_bit(idx, sched->state.used)) - goto done; - } - } - /* Grab the first unused counter starting with idx */ - idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) - goto done; - } - - return false; - -done: - sched->state.counter = idx; - - if (c->overlap) - perf_sched_save_state(sched); - - return true; -} - -static bool perf_sched_find_counter(struct perf_sched *sched) -{ - while (!__perf_sched_find_counter(sched)) { - if (!perf_sched_restore_state(sched)) - return false; - } - - return true; -} - -/* - * Go through all unassigned events and find the next one to schedule. - * Take events with the least weight first. Return true on success. - */ -static bool perf_sched_next_event(struct perf_sched *sched) -{ - struct event_constraint *c; - - if (!sched->state.unassigned || !--sched->state.unassigned) - return false; - - do { - /* next event */ - sched->state.event++; - if (sched->state.event >= sched->max_events) { - /* next weight */ - sched->state.event = 0; - sched->state.weight++; - if (sched->state.weight > sched->max_weight) - return false; - } - c = sched->constraints[sched->state.event]; - } while (c->weight != sched->state.weight); - - sched->state.counter = 0; /* start with first counter */ - - return true; -} - -/* - * Assign a counter for each event. 
- */ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) -{ - struct perf_sched sched; - - perf_sched_init(&sched, constraints, n, wmin, wmax); - - do { - if (!perf_sched_find_counter(&sched)) - break; /* failed */ - if (assign) - assign[sched.state.event] = sched.state.counter; - } while (perf_sched_next_event(&sched)); - - return sched.state.unassigned; -} - int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int i, wmin, wmax, num = 0; + int i, j, w, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); - for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + for (i = 0; i < n; i++) { c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); constraints[i] = c; - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); } /* @@ -698,11 +521,59 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } + if (i == n) + goto done; + + /* + * begin slow path + */ + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); - /* slow path */ - if (i != n) - num = perf_assign_events(constraints, n, wmin, wmax, assign); + /* + * weight = number of possible counters + * + * 1 = most constrained, only works on one counter + * wmax = least constrained, works on any counter + * + * assign events to counters starting with most + * constrained events. + */ + wmax = x86_pmu.num_counters; + /* + * when fixed event counters are present, + * wmax is incremented by 1 to account + * for one more choice + */ + if (x86_pmu.num_counters_fixed) + wmax++; + + for (w = 1, num = n; num && w <= wmax; w++) { + /* for each event */ + for (i = 0; num && i < n; i++) { + c = constraints[i]; + hwc = &cpuc->event_list[i]->hw; + + if (c->weight != w) + continue; + + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + if (!test_bit(j, used_mask)) + break; + } + + if (j == X86_PMC_IDX_MAX) + break; + + __set_bit(j, used_mask); + + if (assign) + assign[i] = j; + num--; + } + } +done: /* * scheduling failed or is just a simulation, * free resources if necessary @@ -1248,7 +1119,6 @@ static void __init pmu_check_apic(void) static int __init init_hw_perf_events(void) { - struct x86_pmu_quirk *quirk; struct event_constraint *c; int err; @@ -1277,8 +1147,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) - quirk->func(); + if (x86_pmu.quirks) + x86_pmu.quirks(); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -1301,18 +1171,12 @@ static int __init init_hw_perf_events(void) unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0); + 0, x86_pmu.num_counters); if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { + if (c->cmask != X86_RAW_EVENT_MASK) continue; - } c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; c->weight += x86_pmu.num_counters; @@ -1702,15 +1566,3 @@ unsigned long perf_misc_flags(struct pt_regs *regs) return misc; } - -void perf_get_x86_pmu_capability(struct 
x86_pmu_capability *cap) -{ - cap->version = x86_pmu.version; - cap->num_counters_gp = x86_pmu.num_counters; - cap->num_counters_fixed = x86_pmu.num_counters_fixed; - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; - cap->events_mask = (unsigned int)x86_pmu.events_maskl; - cap->events_mask_len = x86_pmu.events_mask_len; -} -EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/trunk/arch/x86/kernel/cpu/perf_event.h b/trunk/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..b9698d40ac4b 100644 --- a/trunk/arch/x86/kernel/cpu/perf_event.h +++ b/trunk/arch/x86/kernel/cpu/perf_event.h @@ -45,7 +45,6 @@ struct event_constraint { u64 code; u64 cmask; int weight; - int overlap; }; struct amd_nb { @@ -152,40 +151,15 @@ struct cpu_hw_events { void *kfree_on_online; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ +#define __EVENT_CONSTRAINT(c, n, m, w) {\ { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ - .overlap = (o), \ } #define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) - -/* - * The overlap flag marks event constraints with overlapping counter - * masks. This is the case if the counter mask of such an event is not - * a subset of any other counter mask of a constraint with an equal or - * higher weight, e.g.: - * - * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); - * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); - * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); - * - * The event scheduler may not select the correct counter in the first - * cycle because it needs to know which subsequent events will be - * scheduled. It may fail to schedule the events then. So we set the - * overlap flag for such constraints to give the scheduler a hint which - * events to select for counter rescheduling. - * - * Care must be taken as the rescheduling algorithm is O(n!) which - * will increase scheduling cycles for an over-commited system - * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros - * and its counter masks must be kept at a minimum. - */ -#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) /* * Constraint on the Event code. 
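The overlap machinery removed above existed because the greedy scheduler — restored in the perf_event.c hunk earlier in this patch — places events strictly in ascending weight order and never backtracks, which can fail when counter masks overlap. A standalone sketch of that greedy pass, with simplified names and counter masks borrowed from the amd_f15 constraints in this patch; it is an illustration, not the kernel source:

```c
#include <stdint.h>
#include <stdio.h>

#define NUM_COUNTERS 6	/* e.g. AMD Fam15h core PMCs */

/* Returns the number of events left unassigned (0 on full success). */
static int assign_events(const uint64_t *idxmsk, int n, int *assign)
{
	uint64_t used = 0;
	int num = n;

	/* weight = popcount(mask): 1 = most constrained, placed first. */
	for (int w = 1; num && w <= NUM_COUNTERS; w++) {
		for (int i = 0; num && i < n; i++) {
			if (__builtin_popcountll(idxmsk[i]) != w)
				continue;
			/* Grab the first free counter allowed by the mask. */
			for (int j = 0; j < NUM_COUNTERS; j++) {
				if ((idxmsk[i] & (1ULL << j)) && !(used & (1ULL << j))) {
					used |= 1ULL << j;
					assign[i] = j;
					num--;
					break;
				}
			}
		}
	}
	return num;	/* no backtracking: leftovers mean scheduling failed */
}

int main(void)
{
	uint64_t idxmsk[] = { 0x09, 0x01, 0x3F };	/* amd_f15 PMC30/PMC0/PMC50 */
	int assign[3] = { -1, -1, -1 };

	int left = assign_events(idxmsk, 3, assign);
	printf("unassigned=%d assign={%d,%d,%d}\n",
	       left, assign[0], assign[1], assign[2]);
	return 0;
}
```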
@@ -261,11 +235,6 @@ union perf_capabilities { u64 capabilities; }; -struct x86_pmu_quirk { - struct x86_pmu_quirk *next; - void (*func)(void); -}; - /* * struct x86_pmu - generic x86 pmu */ @@ -290,11 +259,6 @@ struct x86_pmu { int num_counters_fixed; int cntval_bits; u64 cntval_mask; - union { - unsigned long events_maskl; - unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; - }; - int events_mask_len; int apic; u64 max_period; struct event_constraint * @@ -304,7 +268,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - struct x86_pmu_quirk *quirks; + void (*quirks)(void); int perfctr_second_write; int (*cpu_prepare)(int cpu); @@ -345,15 +309,6 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; -#define x86_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = x86_pmu.quirks; \ - x86_pmu.quirks = &__quirk; \ -} while (0) - #define ERF_NO_HT_SHARING 1 #define ERF_HAS_RSP_1 2 diff --git a/trunk/arch/x86/kernel/cpu/perf_event_amd.c b/trunk/arch/x86/kernel/cpu/perf_event_amd.c index 0397b23be8e9..aeefd45697a2 100644 --- a/trunk/arch/x86/kernel/cpu/perf_event_amd.c +++ b/trunk/arch/x86/kernel/cpu/perf_event_amd.c @@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = { static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); diff --git a/trunk/arch/x86/kernel/cpu/perf_event_intel.c b/trunk/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8e..121f1be4da19 100644 --- a/trunk/arch/x86/kernel/cpu/perf_event_intel.c +++ b/trunk/arch/x86/kernel/cpu/perf_event_intel.c @@ -28,7 +28,6 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -46,7 +45,12 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. 
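With x86_add_quirk() and its linked list gone, model-specific fixups return to a single callback slot on the pmu, as in the Clovertown and SandyBridge hunks further down. A toy sketch of that shape (the names here are illustrative, not the kernel structures):

#include <stdio.h>

struct pmu {
        const char *name;
        void (*quirks)(void);   /* at most one model-specific fixup */
};

static void clovertown_quirks(void)
{
        puts("disabling PEBS (unreliable on this model)");
}

static struct pmu pmu = { .name = "demo" };

int main(void)
{
        int model = 15;         /* pretend CPUID reported Merom/Clovertown */

        if (model == 15)
                pmu.quirks = clovertown_quirks;

        if (pmu.quirks)         /* mirrors the init_hw_perf_events() call site */
                pmu.quirks();
        return 0;
}

The visible trade-off: each model can install only one function, where the removed list let several quirks stack and run in sequence.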
+ */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -64,7 +68,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -86,7 +90,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -98,7 +102,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -121,7 +125,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -1515,7 +1519,7 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, }; -static __init void intel_clovertown_quirk(void) +static void intel_clovertown_quirks(void) { /* * PEBS is unreliable due to: @@ -1541,60 +1545,19 @@ static __init void intel_clovertown_quirk(void) x86_pmu.pebs_constraints = NULL; } -static __init void intel_sandybridge_quirk(void) +static void intel_sandybridge_quirks(void) { printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } -static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { - { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, - { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, - { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, - { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, - { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, - { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, - { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, -}; - -static __init void intel_arch_events_quirk(void) -{ - int bit; - - /* disable event that reported as not presend by cpuid */ - for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { - intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", - 
intel_arch_events_map[bit].name); - } -} - -static __init void intel_nehalem_quirk(void) -{ - union cpuid10_ebx ebx; - - ebx.full = x86_pmu.events_maskl; - if (ebx.split.no_branch_misses_retired) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; - ebx.split.no_branch_misses_retired = 0; - x86_pmu.events_maskl = ebx.full; - printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); - } -} - __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; - union cpuid10_ebx ebx; unsigned int unused; + unsigned int ebx; int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -1611,8 +1574,8 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); - if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return -ENODEV; version = eax.split.version_id; @@ -1626,9 +1589,6 @@ __init int intel_pmu_init(void) x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; - x86_pmu.events_maskl = ebx.full; - x86_pmu.events_mask_len = eax.split.mask_length; - /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1648,8 +1608,6 @@ __init int intel_pmu_init(void) intel_ds_init(); - x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ - /* * Install the hw-cache-events table: */ @@ -1659,7 +1617,7 @@ __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - x86_add_quirk(intel_clovertown_quirk); + x86_pmu.quirks = intel_clovertown_quirks; case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1693,8 +1651,17 @@ __init int intel_pmu_init(void) /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; - x86_add_quirk(intel_nehalem_quirk); + if (ebx & 0x40) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. 
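The reworked init path reads CPUID leaf 0xA directly and tests EBX bit 6 (the 0x40 above), which the SDM defines as "Branch Misses Retired event not available". A user-space probe of the same leaf, using GCC's <cpuid.h> (a sketch for illustration, not the kernel's decoding):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
                return 1;       /* leaf 0xA not supported */

        printf("perfmon version : %u\n", eax & 0xff);
        printf("gp counters     : %u\n", (eax >> 8) & 0xff);
        printf("counter width   : %u\n", (eax >> 16) & 0xff);
        /* EBX is a bit vector of *unavailable* architectural events;
         * bit 6 set matches the 0x40 test in the hunk above */
        printf("branch-misses-retired %savailable\n",
               (ebx & 0x40) ? "not " : "");
        return 0;
}

On parts hit by erratum AAJ80 that bit is set, and the hunk substitutes BR_MISP_EXEC.ANY (0x7f89) for the architectural event.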
This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + pr_cont("erratum AAJ80 worked around, "); + } pr_cont("Nehalem events, "); break; @@ -1734,7 +1701,7 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); + x86_pmu.quirks = intel_sandybridge_quirks; case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -1771,6 +1738,5 @@ __init int intel_pmu_init(void) break; } } - return 0; } diff --git a/trunk/arch/x86/kernel/cpu/powerflags.c b/trunk/arch/x86/kernel/cpu/powerflags.c index 7b3fe56b1c21..5abbea297e0c 100644 --- a/trunk/arch/x86/kernel/cpu/powerflags.c +++ b/trunk/arch/x86/kernel/cpu/powerflags.c @@ -16,6 +16,5 @@ const char *const x86_power_flags[32] = { "100mhzsteps", "hwpstate", "", /* tsc invariant mapped to constant_tsc */ - "cpb", /* core performance boost */ - "eff_freq_ro", /* Readonly aperf/mperf */ + /* nothing */ }; diff --git a/trunk/arch/x86/kernel/cpu/proc.c b/trunk/arch/x86/kernel/cpu/proc.c index 8022c6681485..14b23140e81f 100644 --- a/trunk/arch/x86/kernel/cpu/proc.c +++ b/trunk/arch/x86/kernel/cpu/proc.c @@ -64,10 +64,12 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu; + unsigned int cpu = 0; int i; +#ifdef CONFIG_SMP cpu = c->cpu_index; +#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" diff --git a/trunk/arch/x86/kernel/e820.c b/trunk/arch/x86/kernel/e820.c index 8071e2f3d6eb..303a0e48f076 100644 --- a/trunk/arch/x86/kernel/e820.c +++ b/trunk/arch/x86/kernel/e820.c @@ -738,17 +738,35 @@ core_initcall(e820_mark_nvs_memory); /* * pre allocated 4k and reserved it in memblock and e820_saved */ -u64 __init early_reserve_e820(u64 size, u64 align) +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) { + u64 size = 0; u64 addr; + u64 start; - addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); - if (addr) { - e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); - update_e820_saved(); + for (start = startt; ; start += size) { + start = memblock_x86_find_in_range_size(start, &size, align); + if (start == MEMBLOCK_ERROR) + return 0; + if (size >= sizet) + break; } +#ifdef CONFIG_X86_32 + if (start >= MAXMEM) + return 0; + if (start + size > MAXMEM) + size = MAXMEM - start; +#endif + + addr = round_down(start + size - sizet, align); + if (addr < start) + return 0; + memblock_x86_reserve_range(addr, addr + sizet, "new next"); + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + update_e820_saved(); + return addr; } @@ -1072,7 +1090,7 @@ void __init memblock_x86_fill(void) * We are safe to enable resizing, beause memblock_x86_fill() * is rather later for x86 */ - memblock_allow_resize(); + memblock_can_resize = 1; for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; @@ -1087,36 +1105,22 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } + memblock_analyze(); memblock_dump_all(); } void __init memblock_find_dma_reserve(void) { #ifdef CONFIG_X86_64 - u64 nr_pages = 0, nr_free_pages = 0; - unsigned long start_pfn, end_pfn; - 
phys_addr_t start, end; - int i; - u64 u; - + u64 free_size_pfn; + u64 mem_size_pfn; /* * need to find out used area below MAX_DMA_PFN * need to use memblock to get free size in [0, MAX_DMA_PFN] * at first, and assume boot_mem will not take below MAX_DMA_PFN */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); - end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); - nr_pages += end_pfn - start_pfn; - } - - for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { - start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); - end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); - if (start_pfn < end_pfn) - nr_free_pages += end_pfn - start_pfn; - } - - set_dma_reserve(nr_pages - nr_free_pages); + mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + set_dma_reserve(mem_size_pfn - free_size_pfn); #endif } diff --git a/trunk/arch/x86/kernel/entry_32.S b/trunk/arch/x86/kernel/entry_32.S index 22d0e21b4dd7..f3f6f5344001 100644 --- a/trunk/arch/x86/kernel/entry_32.S +++ b/trunk/arch/x86/kernel/entry_32.S @@ -625,8 +625,6 @@ work_notifysig: # deal with pending signals and movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig @@ -640,8 +638,6 @@ work_notifysig_v86: #else movl %esp, %eax #endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S index a20e1cb9dc87..faf8d5e74b0b 100644 --- a/trunk/arch/x86/kernel/entry_64.S +++ b/trunk/arch/x86/kernel/entry_64.S @@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64) /*CFI_REL_OFFSET ss,0*/ pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ + pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ @@ -411,7 +411,7 @@ ENTRY(ret_from_fork) RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - jz retint_restore_args + je int_ret_from_sys_call testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call @@ -465,7 +465,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. 
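Both versions of memblock_find_dma_reserve() above compute the same quantity: pages present below MAX_DMA_PFN minus pages still free there. A self-contained sketch of that arithmetic over made-up ranges (the range data and helper are illustrative):

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* sum of pages in [start_pfn, end_pfn) clipped to [0, limit) */
static unsigned long pages_below(const struct range *r, int n,
                                 unsigned long limit)
{
        unsigned long pages = 0;

        for (int i = 0; i < n; i++) {
                unsigned long s = r[i].start_pfn < limit ? r[i].start_pfn : limit;
                unsigned long e = r[i].end_pfn   < limit ? r[i].end_pfn   : limit;
                if (s < e)
                        pages += e - s;
        }
        return pages;
}

int main(void)
{
        const unsigned long MAX_DMA_PFN = 0x1000;  /* 16MB of 4K pages */
        struct range mem[]   = { { 0x10, 0x9f }, { 0x100, 0x2000 } };
        struct range avail[] = { { 0x20, 0x9f }, { 0x800, 0x2000 } };
        unsigned long nr_pages      = pages_below(mem, 2, MAX_DMA_PFN);
        unsigned long nr_free_pages = pages_below(avail, 2, MAX_DMA_PFN);

        /* this difference is what set_dma_reserve() receives */
        printf("dma reserve = %lu pages\n", nr_pages - nr_free_pages);
        return 0;
}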
*/ -GLOBAL(system_call_after_swapgs) +ENTRY(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(old_rsp) movq PER_CPU_VAR(kernel_stack),%rsp @@ -478,7 +478,8 @@ GLOBAL(system_call_after_swapgs) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + GET_THREAD_INFO(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys system_call_fastpath: cmpq $__NR_syscall_max,%rax @@ -495,9 +496,10 @@ ret_from_sys_call: /* edi: flagmask */ sysret_check: LOCKDEP_SYS_EXIT + GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx jnz sysret_careful CFI_REMEMBER_STATE @@ -581,7 +583,7 @@ sysret_audit: /* Do syscall tracing */ tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jz auditsys #endif SAVE_REST @@ -610,6 +612,8 @@ tracesys: GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF + testl $3,CS-ARGOFFSET(%rsp) + je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) @@ -949,7 +953,6 @@ END(common_interrupt) ENTRY(\sym) INTR_FRAME pushq_cfi $~(\num) -.Lcommon_\sym: interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -973,21 +976,13 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP - ALIGN - INTR_FRAME -.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ +.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 .if NUM_INVALIDATE_TLB_VECTORS > \idx -ENTRY(invalidate_interrupt\idx) - pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) - jmp .Lcommon_invalidate_interrupt0 - CFI_ADJUST_CFA_OFFSET -8 -END(invalidate_interrupt\idx) +apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ + invalidate_interrupt\idx smp_invalidate_interrupt .endif .endr - CFI_ENDPROC -apicinterrupt INVALIDATE_TLB_VECTOR_START, \ - invalidate_interrupt0, smp_invalidate_interrupt #endif apicinterrupt THRESHOLD_APIC_VECTOR \ diff --git a/trunk/arch/x86/kernel/head.c b/trunk/arch/x86/kernel/head.c index 48d9d4ea1020..af0699ba48cf 100644 --- a/trunk/arch/x86/kernel/head.c +++ b/trunk/arch/x86/kernel/head.c @@ -52,5 +52,5 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); + memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); } diff --git a/trunk/arch/x86/kernel/head32.c b/trunk/arch/x86/kernel/head32.c index 51ff18616d50..3bb08509a7a1 100644 --- a/trunk/arch/x86/kernel/head32.c +++ b/trunk/arch/x86/kernel/head32.c @@ -31,8 +31,9 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + memblock_init(); + + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -41,7 +42,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_x86_reserve_range(ramdisk_image, 
ramdisk_end, "RAMDISK"); } #endif diff --git a/trunk/arch/x86/kernel/head64.c b/trunk/arch/x86/kernel/head64.c index 3a3b779f41d3..5655c2272adb 100644 --- a/trunk/arch/x86/kernel/head64.c +++ b/trunk/arch/x86/kernel/head64.c @@ -98,8 +98,9 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + memblock_init(); + + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -108,7 +109,7 @@ void __init x86_64_start_reservations(char *real_mode_data) unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/trunk/arch/x86/kernel/hpet.c b/trunk/arch/x86/kernel/hpet.c index 07b0a56a754d..1bb0bf4d92cd 100644 --- a/trunk/arch/x86/kernel/hpet.c +++ b/trunk/arch/x86/kernel/hpet.c @@ -32,6 +32,8 @@ #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) +#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) + /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ @@ -53,11 +55,6 @@ struct hpet_dev { char name[10]; }; -inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) -{ - return container_of(evtdev, struct hpet_dev, evt); -} - inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); diff --git a/trunk/arch/x86/kernel/irq.c b/trunk/arch/x86/kernel/irq.c index 7943e0c21bde..429e0c92924e 100644 --- a/trunk/arch/x86/kernel/irq.c +++ b/trunk/arch/x86/kernel/irq.c @@ -74,10 +74,6 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); seq_printf(p, " IRQ work interrupts\n"); - seq_printf(p, "%*s: ", prec, "RTR"); - for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); - seq_printf(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -140,7 +136,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; - sum += irq_stats(cpu)->icr_read_retry_count; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; @@ -186,8 +181,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - irq_enter(); exit_idle(); + irq_enter(); irq = __this_cpu_read(vector_irq[vector]); @@ -214,10 +209,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs) ack_APIC_irq(); - irq_enter(); - exit_idle(); + irq_enter(); + inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) diff --git a/trunk/arch/x86/kernel/jump_label.c b/trunk/arch/x86/kernel/jump_label.c index 2889b3d43882..ea9d5f2f13ef 100644 --- a/trunk/arch/x86/kernel/jump_label.c +++ b/trunk/arch/x86/kernel/jump_label.c @@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, +void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { 
__jump_label_transform(entry, type, text_poke_early); diff --git a/trunk/arch/x86/kernel/microcode_amd.c b/trunk/arch/x86/kernel/microcode_amd.c index fe86493f3ed1..d494799aafcd 100644 --- a/trunk/arch/x86/kernel/microcode_amd.c +++ b/trunk/arch/x86/kernel/microcode_amd.c @@ -1,18 +1,14 @@ /* * AMD CPU Microcode Update Driver for Linux - * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * Copyright (C) 2008 Advanced Micro Devices Inc. * * Author: Peter Oruba * * Based on work by: * Tigran Aivazian * - * Maintainers: - * Andreas Herrmann - * Borislav Petkov - * - * This driver allows to upgrade microcode on F10h AMD - * CPUs and later. + * This driver allows to upgrade microcode on AMD + * family 0x10 and 0x11 processors. * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. @@ -75,9 +71,6 @@ struct microcode_amd { static struct equiv_cpu_entry *equiv_cpu_table; -/* page-sized ucode patch buffer */ -void *patch; - static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -93,76 +86,27 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static unsigned int verify_ucode_size(int cpu, u32 patch_size, - unsigned int size) +static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, + int rev) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - u32 max_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 - - switch (c->x86) { - case 0x14: - max_size = F14H_MPB_MAX_SIZE; - break; - case 0x15: - max_size = F15H_MPB_MAX_SIZE; - break; - default: - max_size = F1XH_MPB_MAX_SIZE; - break; - } - - if (patch_size > min_t(u32, size, max_size)) { - pr_err("patch size mismatch\n"); - return 0; - } - - return patch_size; -} - -static u16 find_equiv_id(void) -{ - unsigned int current_cpu_id, i = 0; + unsigned int current_cpu_id; + u16 equiv_cpu_id = 0; + unsigned int i = 0; BUG_ON(equiv_cpu_table == NULL); - current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { - if (current_cpu_id == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + break; + } i++; } - return 0; -} -/* - * we signal a good patch is found by returning its size > 0 - */ -static int get_matching_microcode(int cpu, const u8 *ucode_ptr, - unsigned int leftover_size, int rev, - unsigned int *current_size) -{ - struct microcode_header_amd *mc_hdr; - unsigned int actual_size; - u16 equiv_cpu_id; - - /* size of the current patch we're staring at */ - *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; - - equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) return 0; - /* - * let's look at the patch header itself now - */ - mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); - if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; @@ -176,20 +120,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, if (mc_hdr->patch_id <= rev) return 0; - /* - * now that the header looks sane, verify its size - */ - actual_size = verify_ucode_size(cpu, *current_size, leftover_size); - if (!actual_size) - return 0; - - /* clear the patch buffer */ - memset(patch, 0, PAGE_SIZE); - - /* all looks ok, get the binary patch */ - get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); - - return actual_size; + return 1; } static int 
apply_microcode_amd(int cpu) @@ -224,6 +155,63 @@ static int apply_microcode_amd(int cpu) return 0; } +static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 max_size, actual_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + actual_size = *(u32 *)(buf + 4); + + if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { + pr_err("section size mismatch\n"); + return 0; + } + + return actual_size; +} + +static struct microcode_header_amd * +get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) +{ + struct microcode_header_amd *mc = NULL; + unsigned int actual_size = 0; + + if (*(u32 *)buf != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + goto out; + } + + actual_size = verify_ucode_size(cpu, buf, size); + if (!actual_size) + goto out; + + mc = vzalloc(actual_size); + if (!mc) + goto out; + + get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); + *mc_size = actual_size + SECTION_HDR_SIZE; + +out: + return mc; +} + static int install_equiv_cpu_table(const u8 *buf) { unsigned int *ibuf = (unsigned int *)buf; @@ -259,38 +247,36 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_hdr = NULL; - unsigned int mc_size, leftover, current_size = 0; + unsigned int mc_size, leftover; int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; unsigned int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_ERROR; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); - goto out; + return UCODE_ERROR; } + ucode_ptr += offset; leftover = size - offset; - if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { - pr_err("invalid type field in container file section header\n"); - goto free_table; - } - while (leftover) { - mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, - new_rev, ¤t_size); - if (mc_size) { - mc_hdr = patch; - new_mc = patch; + mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); + if (!mc_hdr) + break; + + if (get_matching_microcode(cpu, mc_hdr, new_rev)) { + vfree(new_mc); new_rev = mc_hdr->patch_id; - goto out_ok; - } + new_mc = mc_hdr; + } else + vfree(mc_hdr); - ucode_ptr += current_size; - leftover -= current_size; + ucode_ptr += mc_size; + leftover -= mc_size; } if (!new_mc) { @@ -298,16 +284,19 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) goto free_table; } -out_ok: - uci->mc = new_mc; - state = UCODE_OK; - pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", - cpu, uci->cpu_sig.rev, new_rev); + if (!leftover) { + vfree(uci->mc); + uci->mc = new_mc; + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); + } else { + vfree(new_mc); + state = UCODE_ERROR; + } free_table: free_equiv_cpu_table(); -out: return state; } @@ -348,6 +337,7 @@ static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + vfree(uci->mc); uci->mc = NULL; } @@ -361,14 +351,5 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { - patch = (void 
*)get_zeroed_page(GFP_KERNEL); - if (!patch) - return NULL; - return &microcode_amd_ops; } - -void __exit exit_amd_microcode(void) -{ - free_page((unsigned long)patch); -} diff --git a/trunk/arch/x86/kernel/microcode_core.c b/trunk/arch/x86/kernel/microcode_core.c index 9302e2d0eb4b..9d46f5e43b51 100644 --- a/trunk/arch/x86/kernel/microcode_core.c +++ b/trunk/arch/x86/kernel/microcode_core.c @@ -563,8 +563,6 @@ module_init(microcode_init); static void __exit microcode_exit(void) { - struct cpuinfo_x86 *c = &cpu_data(0); - microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); @@ -582,9 +580,6 @@ static void __exit microcode_exit(void) microcode_ops = NULL; - if (c->x86_vendor == X86_VENDOR_AMD) - exit_amd_microcode(); - pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } module_exit(microcode_exit); diff --git a/trunk/arch/x86/kernel/mpparse.c b/trunk/arch/x86/kernel/mpparse.c index ca470e4c92dc..0741b062a304 100644 --- a/trunk/arch/x86/kernel/mpparse.c +++ b/trunk/arch/x86/kernel/mpparse.c @@ -564,7 +564,9 @@ void __init default_get_smp_config(unsigned int early) static void __init smp_reserve_memory(struct mpf_intel *mpf) { - memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); + unsigned long size = get_mpc_size(mpf->physptr); + + memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -593,7 +595,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - memblock_reserve(mem, sizeof(*mpf)); + memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); @@ -834,8 +836,10 @@ early_param("alloc_mptable", parse_alloc_mptable_opt); void __init early_reserve_e820_mpc_new(void) { - if (enable_update_mptable && alloc_mptable) - mpc_new_phys = early_reserve_e820(mpc_new_length, 4); + if (enable_update_mptable && alloc_mptable) { + u64 startt = 0; + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); + } } static int __init update_mp_table(void) diff --git a/trunk/arch/x86/kernel/process.c b/trunk/arch/x86/kernel/process.c index 15763af7bfe3..ee5d4fbd53b4 100644 --- a/trunk/arch/x86/kernel/process.c +++ b/trunk/arch/x86/kernel/process.c @@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process..
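On the process.c hunk above: X86_EFLAGS_BIT1 and the literal 0x2 are the same architecturally reserved EFLAGS bit 1, which always reads as 1, so the change is value-neutral. A two-line illustration:

#include <stdio.h>

#define X86_EFLAGS_IF   0x00000200UL    /* interrupt enable flag */
#define X86_EFLAGS_BIT1 0x00000002UL    /* reserved, always set */

int main(void)
{
        /* the initial eflags handed to a new kernel thread */
        unsigned long flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;

        printf("eflags = %#lx\n", flags);       /* prints 0x202 */
        return 0;
}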
*/ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); diff --git a/trunk/arch/x86/kernel/process_32.c b/trunk/arch/x86/kernel/process_32.c index 485204f58cda..795b79f984c2 100644 --- a/trunk/arch/x86/kernel/process_32.c +++ b/trunk/arch/x86/kernel/process_32.c @@ -99,8 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); @@ -117,8 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - rcu_idle_exit(); - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/arch/x86/kernel/process_64.c b/trunk/arch/x86/kernel/process_64.c index 9b9fe4a85c87..3bd7e6eebf31 100644 --- a/trunk/arch/x86/kernel/process_64.c +++ b/trunk/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { rmb(); @@ -139,14 +139,8 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); - - /* enter_idle() needs rcu for notifiers */ - rcu_idle_enter(); - if (cpuidle_idle_call()) pm_idle(); - - rcu_idle_exit(); start_critical_timings(); /* In many cases the interrupt that ended idle @@ -155,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit(); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -299,12 +293,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } diff --git a/trunk/arch/x86/kernel/ptrace.c b/trunk/arch/x86/kernel/ptrace.c index 89a04c7b5bb6..82528799c5de 100644 --- a/trunk/arch/x86/kernel/ptrace.c +++ b/trunk/arch/x86/kernel/ptrace.c @@ -749,8 +749,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, /* * Handle PTRACE_POKEUSR calls for the debug register area.
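The copy_thread() hunk above open-codes what kmemdup() did in one call: allocate IO_BITMAP_BYTES and copy the parent's bitmap into it. A user-space analogue of that helper, with malloc standing in for kmalloc (a sketch of the equivalence only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* same contract as kmemdup(): NULL on allocation failure */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

int main(void)
{
        char parent_bitmap[32] = "io bitmap contents";
        char *child_bitmap = memdup(parent_bitmap, sizeof(parent_bitmap));

        if (!child_bitmap)
                return 1;       /* corresponds to the -ENOMEM path above */
        printf("%s\n", child_bitmap);
        free(child_bitmap);
        return 0;
}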
*/ -static int ptrace_set_debugreg(struct task_struct *tsk, int n, - unsigned long val) +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { struct thread_struct *thread = &(tsk->thread); int rc = 0; diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index d05444ac2aea..cf0ef986cb6d 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -306,8 +306,7 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa(_brk_start), - __pa(_brk_end) - __pa(_brk_start)); + memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -332,13 +331,13 @@ static void __init relocate_initrd(void) ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (!ramdisk_here) + if (ramdisk_here == MEMBLOCK_ERROR) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - memblock_reserve(ramdisk_here, area_size); + memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -394,7 +393,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_x86_free_range(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -417,7 +416,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_x86_free_range(ramdisk_image, ramdisk_end); } #else static void __init reserve_initrd(void) @@ -491,13 +490,15 @@ static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; u64 pa_data; + char buf[32]; if (boot_params.hdr.version < 0x0209) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); - memblock_reserve(pa_data, sizeof(*data) + data->len); + sprintf(buf, "setup data %x", data->type); + memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -553,7 +554,7 @@ static void __init reserve_crashkernel(void) crash_base = memblock_find_in_range(alignment, CRASH_KERNEL_ADDR_MAX, crash_size, alignment); - if (!crash_base) { + if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } @@ -567,7 +568,7 @@ static void __init reserve_crashkernel(void) return; } } - memblock_reserve(crash_base, crash_size); + memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -625,7 +626,7 @@ static __init void reserve_ibft_region(void) addr = find_ibft_region(&size); if (size) - memblock_reserve(addr, size); + memblock_x86_reserve_range(addr, addr + size, "* ibft"); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c index e38e21754eea..9f548cb4a958 100644 --- a/trunk/arch/x86/kernel/smpboot.c +++ 
b/trunk/arch/x86/kernel/smpboot.c @@ -840,8 +840,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || - !physid_isset(apicid, phys_cpu_present_map) || - (!x2apic_mode && apicid >= 255)) { + !physid_isset(apicid, phys_cpu_present_map)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } diff --git a/trunk/arch/x86/kernel/trampoline.c b/trunk/arch/x86/kernel/trampoline.c index a73b61055ad6..a91ae7709b49 100644 --- a/trunk/arch/x86/kernel/trampoline.c +++ b/trunk/arch/x86/kernel/trampoline.c @@ -14,11 +14,11 @@ void __init setup_trampolines(void) /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) + if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); x86_trampoline_base = __va(mem); - memblock_reserve(mem, size); + memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", x86_trampoline_base, (unsigned long long)mem, size); diff --git a/trunk/arch/x86/kernel/traps.c b/trunk/arch/x86/kernel/traps.c index fa1191fb679d..a8e3eb83466c 100644 --- a/trunk/arch/x86/kernel/traps.c +++ b/trunk/arch/x86/kernel/traps.c @@ -306,10 +306,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ - +#ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); diff --git a/trunk/arch/x86/kernel/tsc.c b/trunk/arch/x86/kernel/tsc.c index 2c9cf0fd78f5..db483369f10b 100644 --- a/trunk/arch/x86/kernel/tsc.c +++ b/trunk/arch/x86/kernel/tsc.c @@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -int tsc_clocksource_reliable; +static int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. 
*/ @@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) } #define CAL_MS 10 -#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 -#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 diff --git a/trunk/arch/x86/kernel/tsc_sync.c b/trunk/arch/x86/kernel/tsc_sync.c index 9eba29b46cb7..0aa5fed8b9e6 100644 --- a/trunk/arch/x86/kernel/tsc_sync.c +++ b/trunk/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) pr_info( "Skipped synchronization checks as TSC is reliable.\n"); @@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) return; /* diff --git a/trunk/arch/x86/kernel/vsyscall_64.c b/trunk/arch/x86/kernel/vsyscall_64.c index b07ba9393564..e4d4a22e8b94 100644 --- a/trunk/arch/x86/kernel/vsyscall_64.c +++ b/trunk/arch/x86/kernel/vsyscall_64.c @@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; static int __init vsyscall_setup(char *str) { @@ -140,40 +140,11 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -static bool write_ok_or_segv(unsigned long ptr, size_t size) -{ - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. - */ - - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; - struct thread_struct *thread = &current->thread; - - thread->error_code = 6; /* user fault, no page, write */ - thread->cr2 = ptr; - thread->trap_no = 14; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); - return false; - } else { - return true; - } -} - bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; - int prev_sig_on_uaccess_error; long ret; /* @@ -209,65 +180,35 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) if (seccomp_mode(&tsk->seccomp)) do_exit(SIGKILL); - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; - - /* - * 0 is a valid user pointer (in the access_ok sense) on 32-bit and - * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, 0 means "don't write anything" not "write it at * address 0".
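On the calibration latches changed above: the PIT counts at PIT_TICK_RATE, roughly 1.193182 MHz, and on x86 CLOCK_TICK_RATE is defined as PIT_TICK_RATE, so the rename does not change the value. The latch is simply the number of PIT ticks that elapse in the calibration window, worked out here:

#include <stdio.h>

#define PIT_TICK_RATE 1193182u          /* PIT input clock, Hz */

int main(void)
{
        unsigned int cal_ms = 10, cal2_ms = 50;

        /* ticks per calibration window */
        printf("CAL_LATCH  = %u\n", PIT_TICK_RATE / (1000 / cal_ms));   /* 11931 */
        printf("CAL2_LATCH = %u\n", PIT_TICK_RATE / (1000 / cal2_ms));  /* 59659 */
        return 0;
}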
- */ - ret = -EFAULT; switch (vsyscall_nr) { case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) - break; - ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) - break; - ret = sys_time((time_t __user *)regs->di); break; case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) - break; - ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, 0); break; } - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - if (ret == -EFAULT) { - /* Bad news -- userspace fed a bad pointer to a vsyscall. */ - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall fault (exploit attempt?)"); - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) + * Bad news -- userspace fed a bad pointer to a vsyscall. + * + * With a real vsyscall, that would have caused SIGSEGV. + * To make writing reliable exploits using the emulated + * vsyscalls harder, generate SIGSEGV here as well. */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + goto sigsegv; } regs->ax = ret; diff --git a/trunk/arch/x86/kernel/x86_init.c b/trunk/arch/x86/kernel/x86_init.c index 91f83e21b989..c1d6cd549397 100644 --- a/trunk/arch/x86/kernel/x86_init.c +++ b/trunk/arch/x86/kernel/x86_init.c @@ -92,7 +92,6 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, - .fixup_cpu_id = x86_default_fixup_cpu_id, }; static void default_nmi_init(void) { }; diff --git a/trunk/arch/x86/kvm/i8254.c b/trunk/arch/x86/kvm/i8254.c index 405f2620392f..76e3f1cd0369 100644 --- a/trunk/arch/x86/kvm/i8254.c +++ b/trunk/arch/x86/kvm/i8254.c @@ -338,15 +338,11 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; struct kvm_timer *pt = &ps->pit_timer; s64 interval; - if (!irqchip_in_kernel(kvm)) - return; - interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); @@ -398,13 +394,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) /* FIXME: enhance mode 4 precision */ case 4: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(kvm, val, 0); + create_pit_timer(ps, val, 0); } break; case 2: case 3: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(kvm, val, 1); + create_pit_timer(ps, val, 1); } break; default: diff --git a/trunk/arch/x86/kvm/x86.c b/trunk/arch/x86/kvm/x86.c index 4c938da2ba00..c38efd7b792e 100644 --- a/trunk/arch/x86/kvm/x86.c +++ b/trunk/arch/x86/kvm/x86.c @@ -602,6 +602,7 @@ static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -614,12 +615,15 @@ static void update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= bit(X86_FEATURE_OSXSAVE); } - 
if (apic) { - if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; - } + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + best->function == 0x1) { + best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); + timer_mode_mask = 3 << 17; + } else + timer_mode_mask = 1 << 17; + + if (apic) + apic->lapic_timer.timer_mode_mask = timer_mode_mask; } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -2131,9 +2135,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; default: r = 0; break; diff --git a/trunk/arch/x86/lib/inat.c b/trunk/arch/x86/lib/inat.c index 88ad5fbda6e1..46fc4ee09fc4 100644 --- a/trunk/arch/x86/lib/inat.c +++ b/trunk/arch/x86/lib/inat.c @@ -82,16 +82,9 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; - /* At first, this checks the master table */ - table = inat_avx_tables[vex_m][0]; + table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; - if (!inat_is_group(table[opcode]) && vex_p) { - /* If this is not a group, get attribute directly */ - table = inat_avx_tables[vex_m][vex_p]; - if (!table) - return 0; - } return table[opcode]; } diff --git a/trunk/arch/x86/lib/insn.c b/trunk/arch/x86/lib/insn.c index 5a1f9f3e3fbb..374562ed6704 100644 --- a/trunk/arch/x86/lib/insn.c +++ b/trunk/arch/x86/lib/insn.c @@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); - if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr)) + if (!inat_accept_vex(insn->attr)) insn->attr = 0; /* This instruction is bad */ goto end; /* VEX has only 1 byte for opcode */ } @@ -249,8 +249,6 @@ void insn_get_modrm(struct insn *insn) pfx = insn_last_prefix(insn); insn->attr = inat_get_group_attribute(mod, pfx, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) - insn->attr = 0; /* This is bad */ } } diff --git a/trunk/arch/x86/lib/string_32.c b/trunk/arch/x86/lib/string_32.c index bd59090825db..82004d2bf05e 100644 --- a/trunk/arch/x86/lib/string_32.c +++ b/trunk/arch/x86/lib/string_32.c @@ -164,13 +164,15 @@ EXPORT_SYMBOL(strchr); size_t strlen(const char *s) { int d0; - size_t res; + int res; asm volatile("repne\n\t" - "scasb" + "scasb\n\t" + "notl %0\n\t" + "decl %0" : "=c" (res), "=&D" (d0) : "1" (s), "a" (0), "0" (0xffffffffu) : "memory"); - return ~res - 1; + return res; } EXPORT_SYMBOL(strlen); #endif diff --git a/trunk/arch/x86/lib/x86-opcode-map.txt b/trunk/arch/x86/lib/x86-opcode-map.txt index 5b83c51c12e0..8641bbb8e006 100644 --- a/trunk/arch/x86/lib/x86-opcode-map.txt +++ b/trunk/arch/x86/lib/x86-opcode-map.txt @@ -1,11 +1,5 @@ # x86 Opcode Maps # -# This is (mostly) based on following documentations. -# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 -# (#325383-040US, October 2011) -# - Intel(R) Advanced Vector Extensions Programming Reference -# (#319433-011,JUNE 2011). -# # # Table: table-name # Referrer: escaped-name @@ -21,13 +15,10 @@ # EndTable # # AVX Superscripts -# (v): this opcode requires VEX prefix. -# (v1): this opcode only supports 128bit VEX. 
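In the strlen() hunk above, both forms decode the same residual count: ECX starts at 0xffffffff and repne scasb decrements it once per byte scanned, NUL included, so the length is ~ecx - 1 whether computed in C (the removed return) or by the added notl/decl pair. A C model of the count:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* model the rep-scasb counter: starts at ~0 and is decremented
 * once per byte examined, terminating byte included */
static uint32_t scasb_residual(const char *s)
{
        uint32_t ecx = 0xffffffffu;

        do {
                ecx--;
        } while (*s++);
        return ecx;
}

int main(void)
{
        const char *s = "hello";
        uint32_t res = scasb_residual(s);

        /* 6 bytes scanned, so ~res == 6 and the length is 5 */
        assert((size_t)(~res - 1) == strlen(s));
        return 0;
}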
-#
-# Last Prefix Superscripts
-# - (66): the last prefix is 0x66
-# - (F3): the last prefix is 0xF3
-# - (F2): the last prefix is 0xF2
+# (VEX): this opcode can accept VEX prefix.
+# (oVEX): this opcode requires VEX prefix.
+# (o128): this opcode only supports 128bit VEX.
+# (o256): this opcode only supports 256bit VEX.
 # Table: one byte opcode
@@ -208,8 +199,8 @@ a0: MOV AL,Ob
 a1: MOV rAX,Ov
 a2: MOV Ob,AL
 a3: MOV Ov,rAX
-a4: MOVS/B Yb,Xb
-a5: MOVS/W/D/Q Yv,Xv
+a4: MOVS/B Xb,Yb
+a5: MOVS/W/D/Q Xv,Yv
 a6: CMPS/B Xb,Yb
 a7: CMPS/W/D Xv,Yv
 a8: TEST AL,Ib
@@ -219,7 +210,9 @@ ab: STOS/W/D/Q Yv,rAX
 ac: LODS/B AL,Xb
 ad: LODS/W/D/Q rAX,Xv
 ae: SCAS/B AL,Yb
-af: SCAS/W/D/Q rAX,Xv
+# Note: The May 2011 Intel manual shows Xv for the second parameter of the
+# next instruction but Yv is correct
+af: SCAS/W/D/Q rAX,Yv
 # 0xb0 - 0xbf
 b0: MOV AL/R8L,Ib
 b1: MOV CL/R9L,Ib
@@ -242,8 +235,8 @@ c0: Grp2 Eb,Ib (1A)
 c1: Grp2 Ev,Ib (1A)
 c2: RETN Iw (f64)
 c3: RETN
-c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
-c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
+c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
+c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
 c6: Grp11 Eb,Ib (1A)
 c7: Grp11 Ev,Iz (1A)
 c8: ENTER Iw,Ib
@@ -329,19 +322,14 @@ AVXcode: 1
 # 3DNow! uses the last imm byte as opcode extension.
 0f: 3DNow! Pq,Qq,Ib
 # 0x0f 0x10-0x1f
-# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
-# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
-# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
-# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
-# Reference A.1
-10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
-11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
-12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
-13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
-14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
-15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
-16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
-17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
+10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
+11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
+12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
+13: movlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
+14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
+15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
+16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlhps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
+17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
 18: Grp16 (1A)
 19:
 1a:
@@ -359,14 +347,14 @@ AVXcode: 1
 25:
 26:
 27:
-28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
-29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
-2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
-2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
-2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
-2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
-2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
-2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
+28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
+29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
+2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
+2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
+2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
+2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
 # 0x0f 0x30-0x3f
 30: WRMSR
 31: RDTSC
@@ -402,66 +390,65 @@ AVXcode: 1
 4e: CMOVLE/NG Gv,Ev
 4f: CMOVNLE/G Gv,Ev
 # 0x0f 0x50-0x5f
-50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
-51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
-52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
-53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
-54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
-55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
-56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
-57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
-58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
-59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
-5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
-5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
-5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
-5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
-5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
-5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
+50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
+51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
+52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
+53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
+54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
+55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
+56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
+57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
+58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
+59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
+5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
+5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
+5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
+5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
+5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
+5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
 # 0x0f 0x60-0x6f
-60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
-61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
-62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
-63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
-64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
-65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
-66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
-67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
-68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
-69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
-6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
-6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
-6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
-6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
-6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
-6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
+60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
+61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
+62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
+63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
+64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
+65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
+66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
+67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
+68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
+69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
+6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
+6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
+6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
+6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
+6e: movd/q Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
+6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
 # 0x0f 0x70-0x7f
-70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
+70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw Vdq,Wdq,Ib (F2),(VEX),(o128)
 71: Grp12 (1A)
 72: Grp13 (1A)
 73: Grp14 (1A)
-74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
-75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
-76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
-# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
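# An informal worked example of how the decoder disambiguates 0x0f 0x77
# (illustrative only; these byte sequences are notes, and '#' lines are
# skipped by the table generator):
#   0f 77      -> emms        (no VEX prefix)
#   c5 f8 77   -> vzeroupper  (2-byte VEX, VEX.L=0)
#   c5 fc 77   -> vzeroall    (2-byte VEX, VEX.L=1)
# The opcode byte is identical in all three cases; only the presence of a
# VEX prefix and its L bit differ, which is why a single (VEX)-tagged map
# entry can carry all three mnemonics.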
-77: emms | vzeroupper | vzeroall -78: VMREAD Ey,Gy -79: VMWRITE Gy,Ey +74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) +75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) +76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) +77: emms/vzeroupper/vzeroall (VEX) +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q 7a: 7b: -7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2) -7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2) -7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) -7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) +7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) +7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) +7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) +7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) # 0x0f 0x80-0x8f 80: JO Jz (f64) 81: JNO Jz (f64) -82: JB/JC/JNAE Jz (f64) -83: JAE/JNB/JNC Jz (f64) -84: JE/JZ Jz (f64) -85: JNE/JNZ Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) 86: JBE/JNA Jz (f64) -87: JA/JNBE Jz (f64) +87: JNBE/JA Jz (f64) 88: JS Jz (f64) 89: JNS Jz (f64) 8a: JP/JPE Jz (f64) @@ -517,18 +504,18 @@ b8: JMPE | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) -bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) +bc: BSF Gv,Ev +bd: BSR Gv,Ev be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf c0: XADD Eb,Gb c1: XADD Ev,Gv -c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1) -c3: movnti My,Gy -c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1) -c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1) -c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66) +c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) +c3: movnti Md/q,Gd/q +c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) +c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) +c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) c7: Grp9 (1A) c8: BSWAP RAX/EAX/R8/R8D c9: BSWAP RCX/ECX/R9/R9D @@ -539,55 +526,55 @@ cd: BSWAP RBP/EBP/R13/R13D ce: BSWAP RSI/ESI/R14/R14D cf: BSWAP RDI/EDI/R15/R15D # 0x0f 0xd0-0xdf -d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2) -d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1) -d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1) -d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1) -d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1) -d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1) -d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) -d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1) -d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1) -d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1) -da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1) -db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) -dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1) -dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1) -de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1) -df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) +d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) +d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) +d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) +d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) +d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) +d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) +d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) +d8: psubusb 
Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) +d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) +da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) +db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) +dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) +dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) +de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) +df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) # 0x0f 0xe0-0xef -e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1) -e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1) -e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1) -e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1) -e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1) -e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1) -e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2) -e7: movntq Mq,Pq | vmovntdq Mx,Vx (66) -e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1) -e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1) -ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1) -eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) -ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1) -ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1) -ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1) -ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) +e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) +e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) +e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) +e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) +e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) +e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) +e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) +e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) +e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) +e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) +ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) +eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) +ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) +ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) +ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) +ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) # 0x0f 0xf0-0xff -f0: vlddqu Vx,Mx (F2) -f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1) -f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1) -f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1) -f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1) -f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1) -f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1) -f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1) -f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1) -f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1) -fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1) -fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) -fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) -fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) -fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +f0: lddqu Vdq,Mdq (F2),(VEX) +f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) +f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) +f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) +f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) +f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) +f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) +f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) +f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) +f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) +fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) +fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) +fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) +fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) +fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) ff: EndTable @@ -595,193 +582,155 @@ Table: 3-byte 
opcode 1 (0x0f 0x38) Referrer: 3-byte escape 1 AVXcode: 2 # 0x0f 0x38 0x00-0x0f -00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1) -01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1) -02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1) -03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1) -04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1) -05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1) -06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1) -07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1) -08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1) -09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1) -0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1) -0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1) -0c: vpermilps Vx,Hx,Wx (66),(v) -0d: vpermilpd Vx,Hx,Wx (66),(v) -0e: vtestps Vx,Wx (66),(v) -0f: vtestpd Vx,Wx (66),(v) +00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) +01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) +02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) +03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) +04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) +05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) +06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) +07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) +08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) +09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) +0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) +0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) +0c: Vpermilps /r (66),(oVEX) +0d: Vpermilpd /r (66),(oVEX) +0e: vtestps /r (66),(oVEX) +0f: vtestpd /r (66),(oVEX) # 0x0f 0x38 0x10-0x1f 10: pblendvb Vdq,Wdq (66) 11: 12: -13: vcvtph2ps Vx,Wx,Ib (66),(v) +13: 14: blendvps Vdq,Wdq (66) 15: blendvpd Vdq,Wdq (66) -16: vpermps Vqq,Hqq,Wqq (66),(v) -17: vptest Vx,Wx (66) -18: vbroadcastss Vx,Wd (66),(v) -19: vbroadcastsd Vqq,Wq (66),(v) -1a: vbroadcastf128 Vqq,Mdq (66),(v) +16: +17: ptest Vdq,Wdq (66),(VEX) +18: vbroadcastss /r (66),(oVEX) +19: vbroadcastsd /r (66),(oVEX),(o256) +1a: vbroadcastf128 /r (66),(oVEX),(o256) 1b: -1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1) -1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1) -1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1) +1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) +1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) +1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) 1f: # 0x0f 0x38 0x20-0x2f -20: vpmovsxbw Vx,Ux/Mq (66),(v1) -21: vpmovsxbd Vx,Ux/Md (66),(v1) -22: vpmovsxbq Vx,Ux/Mw (66),(v1) -23: vpmovsxwd Vx,Ux/Mq (66),(v1) -24: vpmovsxwq Vx,Ux/Md (66),(v1) -25: vpmovsxdq Vx,Ux/Mq (66),(v1) +20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) +21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) +22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) +23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) +24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) +25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) 26: 27: -28: vpmuldq Vx,Hx,Wx (66),(v1) -29: vpcmpeqq Vx,Hx,Wx (66),(v1) -2a: vmovntdqa Vx,Mx (66),(v1) -2b: vpackusdw Vx,Hx,Wx (66),(v1) -2c: vmaskmovps Vx,Hx,Mx (66),(v) -2d: vmaskmovpd Vx,Hx,Mx (66),(v) -2e: vmaskmovps Mx,Hx,Vx (66),(v) -2f: vmaskmovpd Mx,Hx,Vx (66),(v) +28: pmuldq Vdq,Wdq (66),(VEX),(o128) +29: pcmpeqq Vdq,Wdq (66),(VEX),(o128) +2a: movntdqa Vdq,Mdq (66),(VEX),(o128) +2b: packusdw Vdq,Wdq (66),(VEX),(o128) +2c: vmaskmovps(ld) /r (66),(oVEX) +2d: vmaskmovpd(ld) /r (66),(oVEX) +2e: vmaskmovps(st) /r (66),(oVEX) +2f: vmaskmovpd(st) /r (66),(oVEX) # 0x0f 0x38 0x30-0x3f -30: vpmovzxbw Vx,Ux/Mq (66),(v1) -31: vpmovzxbd Vx,Ux/Md (66),(v1) -32: vpmovzxbq Vx,Ux/Mw (66),(v1) -33: vpmovzxwd Vx,Ux/Mq (66),(v1) -34: vpmovzxwq Vx,Ux/Md (66),(v1) -35: vpmovzxdq 
Vx,Ux/Mq (66),(v1) -36: vpermd Vqq,Hqq,Wqq (66),(v) -37: vpcmpgtq Vx,Hx,Wx (66),(v1) -38: vpminsb Vx,Hx,Wx (66),(v1) -39: vpminsd Vx,Hx,Wx (66),(v1) -3a: vpminuw Vx,Hx,Wx (66),(v1) -3b: vpminud Vx,Hx,Wx (66),(v1) -3c: vpmaxsb Vx,Hx,Wx (66),(v1) -3d: vpmaxsd Vx,Hx,Wx (66),(v1) -3e: vpmaxuw Vx,Hx,Wx (66),(v1) -3f: vpmaxud Vx,Hx,Wx (66),(v1) +30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) +31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) +32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) +33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) +34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) +35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) +36: +37: pcmpgtq Vdq,Wdq (66),(VEX),(o128) +38: pminsb Vdq,Wdq (66),(VEX),(o128) +39: pminsd Vdq,Wdq (66),(VEX),(o128) +3a: pminuw Vdq,Wdq (66),(VEX),(o128) +3b: pminud Vdq,Wdq (66),(VEX),(o128) +3c: pmaxsb Vdq,Wdq (66),(VEX),(o128) +3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) +3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) +3f: pmaxud Vdq,Wdq (66),(VEX),(o128) # 0x0f 0x38 0x40-0x8f -40: vpmulld Vx,Hx,Wx (66),(v1) -41: vphminposuw Vdq,Wdq (66),(v1) -42: -43: -44: -45: vpsrlvd/q Vx,Hx,Wx (66),(v) -46: vpsravd Vx,Hx,Wx (66),(v) -47: vpsllvd/q Vx,Hx,Wx (66),(v) -# Skip 0x48-0x57 -58: vpbroadcastd Vx,Wx (66),(v) -59: vpbroadcastq Vx,Wx (66),(v) -5a: vbroadcasti128 Vqq,Mdq (66),(v) -# Skip 0x5b-0x77 -78: vpbroadcastb Vx,Wx (66),(v) -79: vpbroadcastw Vx,Wx (66),(v) -# Skip 0x7a-0x7f -80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) -82: INVPCID Gy,Mdq (66) -8c: vpmaskmovd/q Vx,Hx,Mx (66),(v) -8e: vpmaskmovd/q Mx,Vx,Hx (66),(v) +40: pmulld Vdq,Wdq (66),(VEX),(o128) +41: phminposuw Vdq,Wdq (66),(VEX),(o128) +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) # 0x0f 0x38 0x90-0xbf (FMA) -90: vgatherdd/q Vx,Hx,Wx (66),(v) -91: vgatherqd/q Vx,Hx,Wx (66),(v) -92: vgatherdps/d Vx,Hx,Wx (66),(v) -93: vgatherqps/d Vx,Hx,Wx (66),(v) -94: -95: -96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v) -97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v) -98: vfmadd132ps/d Vx,Hx,Wx (66),(v) -99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1) -9a: vfmsub132ps/d Vx,Hx,Wx (66),(v) -9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1) -9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v) -9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1) -9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v) -9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1) -a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v) -a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v) -a8: vfmadd213ps/d Vx,Hx,Wx (66),(v) -a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1) -aa: vfmsub213ps/d Vx,Hx,Wx (66),(v) -ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1) -ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) -ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) -ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) -af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) -b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) -b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) -b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) -b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1) -ba: vfmsub231ps/d Vx,Hx,Wx (66),(v) -bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1) -bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v) -bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) -be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) -bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +96: vfmaddsub132pd/ps /r (66),(VEX) +97: vfmsubadd132pd/ps /r (66),(VEX) +98: vfmadd132pd/ps /r (66),(VEX) +99: vfmadd132sd/ss /r (66),(VEX),(o128) +9a: vfmsub132pd/ps /r (66),(VEX) +9b: vfmsub132sd/ss /r (66),(VEX),(o128) +9c: vfnmadd132pd/ps /r (66),(VEX) +9d: vfnmadd132sd/ss /r (66),(VEX),(o128) +9e: vfnmsub132pd/ps /r (66),(VEX) +9f: vfnmsub132sd/ss /r (66),(VEX),(o128) +a6: vfmaddsub213pd/ps /r (66),(VEX) +a7: vfmsubadd213pd/ps /r (66),(VEX) +a8: vfmadd213pd/ps /r (66),(VEX) +a9: vfmadd213sd/ss /r (66),(VEX),(o128) 
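# Informal reading aid for the FMA 132/213/231 suffixes (a comment, not a
# table entry): the digits name the order in which the three operands enter
# the multiply-add, with operand 1 always receiving the result. For the
# packed-single forms, for example:
#   vfmadd132ps x1,x2,x3:  x1 = x1*x3 + x2
#   vfmadd213ps x1,x2,x3:  x1 = x2*x1 + x3
#   vfmadd231ps x1,x2,x3:  x1 = x2*x3 + x1
# The three variants exist so a compiler can overwrite whichever source
# value is dead without spending an extra register move.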
+aa: vfmsub213pd/ps /r (66),(VEX) +ab: vfmsub213sd/ss /r (66),(VEX),(o128) +ac: vfnmadd213pd/ps /r (66),(VEX) +ad: vfnmadd213sd/ss /r (66),(VEX),(o128) +ae: vfnmsub213pd/ps /r (66),(VEX) +af: vfnmsub213sd/ss /r (66),(VEX),(o128) +b6: vfmaddsub231pd/ps /r (66),(VEX) +b7: vfmsubadd231pd/ps /r (66),(VEX) +b8: vfmadd231pd/ps /r (66),(VEX) +b9: vfmadd231sd/ss /r (66),(VEX),(o128) +ba: vfmsub231pd/ps /r (66),(VEX) +bb: vfmsub231sd/ss /r (66),(VEX),(o128) +bc: vfnmadd231pd/ps /r (66),(VEX) +bd: vfnmadd231sd/ss /r (66),(VEX),(o128) +be: vfnmsub231pd/ps /r (66),(VEX) +bf: vfnmsub231sd/ss /r (66),(VEX),(o128) # 0x0f 0x38 0xc0-0xff -db: VAESIMC Vdq,Wdq (66),(v1) -dc: VAESENC Vdq,Hdq,Wdq (66),(v1) -dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) -de: VAESDEC Vdq,Hdq,Wdq (66),(v1) -df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) -f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) -f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) -f3: ANDN Gy,By,Ey (v) -f4: Grp17 (1A) -f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: MULX By,Gy,rDX,Ey (F2),(v) -f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) +db: aesimc Vdq,Wdq (66),(VEX),(o128) +dc: aesenc Vdq,Wdq (66),(VEX),(o128) +dd: aesenclast Vdq,Wdq (66),(VEX),(o128) +de: aesdec Vdq,Wdq (66),(VEX),(o128) +df: aesdeclast Vdq,Wdq (66),(VEX),(o128) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) Referrer: 3-byte escape 2 AVXcode: 3 # 0x0f 0x3a 0x00-0xff -00: vpermq Vqq,Wqq,Ib (66),(v) -01: vpermpd Vqq,Wqq,Ib (66),(v) -02: vpblendd Vx,Hx,Wx,Ib (66),(v) -03: -04: vpermilps Vx,Wx,Ib (66),(v) -05: vpermilpd Vx,Wx,Ib (66),(v) -06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v) -07: -08: vroundps Vx,Wx,Ib (66) -09: vroundpd Vx,Wx,Ib (66) -0a: vroundss Vss,Wss,Ib (66),(v1) -0b: vroundsd Vsd,Wsd,Ib (66),(v1) -0c: vblendps Vx,Hx,Wx,Ib (66) -0d: vblendpd Vx,Hx,Wx,Ib (66) -0e: vpblendw Vx,Hx,Wx,Ib (66),(v1) -0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1) -14: vpextrb Rd/Mb,Vdq,Ib (66),(v1) -15: vpextrw Rd/Mw,Vdq,Ib (66),(v1) -16: vpextrd/q Ey,Vdq,Ib (66),(v1) -17: vextractps Ed,Vdq,Ib (66),(v1) -18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) -19: vextractf128 Wdq,Vqq,Ib (66),(v) -1d: vcvtps2ph Wx,Vx,Ib (66),(v) -20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1) -21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1) -22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1) -38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) -39: vextracti128 Wdq,Vqq,Ib (66),(v) -40: vdpps Vx,Hx,Wx,Ib (66) -41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1) -42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) -44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1) -46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v) -4a: vblendvps Vx,Hx,Wx,Lx (66),(v) -4b: vblendvpd Vx,Hx,Wx,Lx (66),(v) -4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1) -60: vpcmpestrm Vdq,Wdq,Ib (66),(v1) -61: vpcmpestri Vdq,Wdq,Ib (66),(v1) -62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) -63: vpcmpistri Vdq,Wdq,Ib (66),(v1) -df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) -f0: RORX Gy,Ey,Ib (F2),(v) +04: vpermilps /r,Ib (66),(oVEX) +05: vpermilpd /r,Ib (66),(oVEX) +06: vperm2f128 /r,Ib (66),(oVEX),(o256) +08: roundps Vdq,Wdq,Ib (66),(VEX) +09: roundpd Vdq,Wdq,Ib (66),(VEX) +0a: roundss Vss,Wss,Ib (66),(VEX),(o128) +0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) +0c: blendps Vdq,Wdq,Ib (66),(VEX) +0d: blendpd Vdq,Wdq,Ib (66),(VEX) +0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) +0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) +14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) +15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) +16: pextrd/pextrq Ed/q,Vdq,Ib 
(66),(VEX),(o128) +17: extractps Ed,Vdq,Ib (66),(VEX),(o128) +18: vinsertf128 /r,Ib (66),(oVEX),(o256) +19: vextractf128 /r,Ib (66),(oVEX),(o256) +20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) +21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) +22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) +40: dpps Vdq,Wdq,Ib (66),(VEX) +41: dppd Vdq,Wdq,Ib (66),(VEX),(o128) +42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) +44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) +4a: vblendvps /r,Ib (66),(oVEX) +4b: vblendvpd /r,Ib (66),(oVEX) +4c: vpblendvb /r,Ib (66),(oVEX),(o128) +60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) +61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) +62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) +63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) +df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) EndTable GrpTable: Grp1 @@ -843,7 +792,7 @@ GrpTable: Grp5 2: CALLN Ev (f64) 3: CALLF Ep 4: JMPN Ev (f64) -5: JMPF Mp +5: JMPF Ep 6: PUSH Ev (d64) 7: EndTable @@ -860,7 +809,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -877,45 +826,44 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq -6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) -7: VMPTRST Mq | VMPTRST Mq (F3) +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq EndTable GrpTable: Grp10 EndTable GrpTable: Grp11 -# Note: the operands are given by group opcode 0: MOV EndTable GrpTable: Grp12 -2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1) -4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1) -6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1) +2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) +4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) +6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) EndTable GrpTable: Grp13 -2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1) -4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) -6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1) +2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) +4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) +6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) EndTable GrpTable: Grp14 -2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1) -3: vpsrldq Hx,Ux,Ib (66),(11B),(v1) -6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1) -7: vpslldq Hx,Ux,Ib (66),(11B),(v1) +2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) +3: psrldq Udq,Ib (66),(11B),(VEX),(o128) +6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) +7: pslldq Udq,Ib (66),(11B),(VEX),(o128) EndTable GrpTable: Grp15 -0: fxsave | RDFSBASE Ry (F3),(11B) -1: fxstor | RDGSBASE Ry (F3),(11B) -2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) -3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) +0: fxsave +1: fxstor +2: ldmxcsr (VEX) +3: stmxcsr (VEX) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) +6: mfence (11B) 7: clflush | sfence (11B) EndTable @@ -926,12 +874,6 @@ GrpTable: Grp16 3: prefetch T2 EndTable -GrpTable: Grp17 -1: BLSR By,Ey (v) -2: BLSMSK By,Ey (v) -3: BLSI By,Ey (v) -EndTable - # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/trunk/arch/x86/mm/Makefile b/trunk/arch/x86/mm/Makefile index 23d8e5fecf76..3d11327c9ab4 100644 --- a/trunk/arch/x86/mm/Makefile +++ b/trunk/arch/x86/mm/Makefile @@ -27,4 +27,6 @@ 
obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o +obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o + obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/trunk/arch/x86/mm/extable.c b/trunk/arch/x86/mm/extable.c index 1fb85dbe390a..d0474ad2a6e5 100644 --- a/trunk/arch/x86/mm/extable.c +++ b/trunk/arch/x86/mm/extable.c @@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs) if (fixup) { /* If fixup is less than 16, it means uaccess error */ if (fixup->fixup < 16) { - current_thread_info()->uaccess_err = 1; + current_thread_info()->uaccess_err = -EFAULT; regs->ip += fixup->fixup; return 1; } diff --git a/trunk/arch/x86/mm/fault.c b/trunk/arch/x86/mm/fault.c index 9d74824a708d..5db0490deb07 100644 --- a/trunk/arch/x86/mm/fault.c +++ b/trunk/arch/x86/mm/fault.c @@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, static noinline void no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; @@ -634,17 +634,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) { - if (current_thread_info()->sig_on_uaccess_error && signal) { - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code | PF_USER; - tsk->thread.cr2 = address; - - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_info_fault(signal, si_code, address, tsk, 0); - } + if (fixup_exception(regs)) return; - } /* * 32-bit: @@ -764,7 +755,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_f00f_bug(regs, address)) return; - no_context(regs, error_code, address, SIGSEGV, si_code); + no_context(regs, error_code, address); } static noinline void @@ -828,7 +819,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, error_code, address); return; } @@ -863,7 +854,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (!(fault & VM_FAULT_RETRY)) up_read(¤t->mm->mmap_sem); if (!(error_code & PF_USER)) - no_context(regs, error_code, address, 0, 0); + no_context(regs, error_code, address); return 1; } if (!(fault & VM_FAULT_ERROR)) @@ -873,8 +864,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, /* Kernel mode? 
Handle exceptions or die: */ if (!(error_code & PF_USER)) { up_read(¤t->mm->mmap_sem); - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); + no_context(regs, error_code, address); return 1; } diff --git a/trunk/arch/x86/mm/init.c b/trunk/arch/x86/mm/init.c index a298914058f9..87488b93a65c 100644 --- a/trunk/arch/x86/mm/init.c +++ b/trunk/arch/x86/mm/init.c @@ -67,7 +67,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, good_end = max_pfn_mapped << PAGE_SHIFT; base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) + if (base == MEMBLOCK_ERROR) panic("Cannot find space for the kernel page tables"); pgt_buf_start = base >> PAGE_SHIFT; @@ -80,7 +80,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, void __init native_pagetable_reserve(u64 start, u64 end) { - memblock_reserve(start, end - start); + memblock_x86_reserve_range(start, end, "PGTABLE"); } struct map_range { @@ -279,8 +279,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) * so that they can be reused for other purposes. * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before * but that haven't been used. * * In fact on xen we mark RO the whole range pgt_buf_start - diff --git a/trunk/arch/x86/mm/init_32.c b/trunk/arch/x86/mm/init_32.c index 0c1da394a634..29f7c6d98179 100644 --- a/trunk/arch/x86/mm/init_32.c +++ b/trunk/arch/x86/mm/init_32.c @@ -427,17 +427,23 @@ static void __init add_one_highpage_init(struct page *page) void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn) { - phys_addr_t start, end; - u64 i; - - for_each_free_mem_range(i, nid, &start, &end, NULL) { - unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), - start_pfn, end_pfn); - unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), - start_pfn, end_pfn); - for ( ; pfn < e_pfn; pfn++) - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn)); + struct range *range; + int nr_range; + int i; + + nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); + + for (i = 0; i < nr_range; i++) { + struct page *page; + int node_pfn; + + for (node_pfn = range[i].start; node_pfn < range[i].end; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page); + } } } #else @@ -644,18 +650,18 @@ void __init initmem_init(void) highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; + memblock_x86_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else + memblock_x86_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif - - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); - sparse_memory_present_with_active_regions(0); - #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif diff --git a/trunk/arch/x86/mm/init_64.c b/trunk/arch/x86/mm/init_64.c index 
a8a56ce3a962..bbaaa005bf0e 100644 --- a/trunk/arch/x86/mm/init_64.c +++ b/trunk/arch/x86/mm/init_64.c @@ -608,7 +608,7 @@ kernel_physical_mapping_init(unsigned long start, #ifndef CONFIG_NUMA void __init initmem_init(void) { - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_x86_register_active_regions(0, 0, max_pfn); } #endif diff --git a/trunk/arch/x86/mm/memblock.c b/trunk/arch/x86/mm/memblock.c new file mode 100644 index 000000000000..992da5ec5a64 --- /dev/null +++ b/trunk/arch/x86/mm/memblock.c @@ -0,0 +1,348 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Check for already reserved areas */ +bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align) +{ + struct memblock_region *r; + u64 addr = *addrp, last; + u64 size = *sizep; + bool changed = false; + +again: + last = addr + size; + for_each_memblock(reserved, r) { + if (last > r->base && addr < r->base) { + size = r->base - addr; + changed = true; + goto again; + } + if (last > (r->base + r->size) && addr < (r->base + r->size)) { + addr = round_up(r->base + r->size, align); + size = last - addr; + changed = true; + goto again; + } + if (last <= (r->base + r->size) && addr >= r->base) { + *sizep = 0; + return false; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find next free range after start, and size is returned in *sizep + */ +u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) +{ + struct memblock_region *r; + + for_each_memblock(memory, r) { + u64 ei_start = r->base; + u64 ei_last = ei_start + r->size; + u64 addr; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (memblock_x86_check_reserved_size(&addr, sizep, align)) + ; + + if (*sizep) + return addr; + } + + return MEMBLOCK_ERROR; +} + +static __init struct range *find_range_array(int count) +{ + u64 end, size, mem; + struct range *range; + + size = sizeof(struct range) * count; + end = memblock.current_limit; + + mem = memblock_find_in_range(0, end, size, sizeof(struct range)); + if (mem == MEMBLOCK_ERROR) + panic("can not find more space for range array"); + + /* + * This range is tempoaray, so don't reserve it, it will not be + * overlapped because We will not alloccate new buffer before + * We discard this one + */ + range = __va(mem); + memset(range, 0, size); + + return range; +} + +static void __init memblock_x86_subtract_reserved(struct range *range, int az) +{ + u64 final_start, final_end; + struct memblock_region *r; + + /* Take out region array itself at first*/ + memblock_free_reserved_regions(); + + memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); + + for_each_memblock(reserved, r) { + memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); + final_start = PFN_DOWN(r->base); + final_end = PFN_UP(r->base + r->size); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + + /* Put region array back ? 
*/ + memblock_reserve_reserved_regions(); +} + +struct count_data { + int nr; +}; + +static int __init count_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct count_data *data = datax; + + data->nr++; + + return 0; +} + +static int __init count_early_node_map(int nodeid) +{ + struct count_data data; + + data.nr = 0; + work_with_active_regions(nodeid, count_work_fn, &data); + + return data.nr; +} + +int __init __get_free_all_memory_range(struct range **rangep, int nodeid, + unsigned long start_pfn, unsigned long end_pfn) +{ + int count; + struct range *range; + int nr_range; + + count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; + + range = find_range_array(count); + nr_range = 0; + + /* + * Use early_node_map[] and memblock.reserved.region to get range array + * at first + */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_range(range, count, 0, start_pfn); + subtract_range(range, count, end_pfn, -1ULL); + + memblock_x86_subtract_reserved(range, count); + nr_range = clean_sort_range(range, count); + + *rangep = range; + return nr_range; +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + unsigned long end_pfn = -1UL; + +#ifdef CONFIG_X86_32 + end_pfn = max_low_pfn; +#endif + return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); +} + +static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) +{ + int i, count; + struct range *range; + int nr_range; + u64 final_start, final_end; + u64 free_size; + struct memblock_region *r; + + count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; + + range = find_range_array(count); + nr_range = 0; + + addr = PFN_UP(addr); + limit = PFN_DOWN(limit); + + for_each_memblock(memory, r) { + final_start = PFN_UP(r->base); + final_end = PFN_DOWN(r->base + r->size); + if (final_start >= final_end) + continue; + if (final_start >= limit || final_end <= addr) + continue; + + nr_range = add_range(range, count, nr_range, final_start, final_end); + } + subtract_range(range, count, 0, addr); + subtract_range(range, count, limit, -1ULL); + + /* Subtract memblock.reserved.region in range ? 
*/ + if (!get_free) + goto sort_and_count_them; + for_each_memblock(reserved, r) { + final_start = PFN_DOWN(r->base); + final_end = PFN_UP(r->base + r->size); + if (final_start >= final_end) + continue; + if (final_start >= limit || final_end <= addr) + continue; + + subtract_range(range, count, final_start, final_end); + } + +sort_and_count_them: + nr_range = clean_sort_range(range, count); + + free_size = 0; + for (i = 0; i < nr_range; i++) + free_size += range[i].end - range[i].start; + + return free_size << PAGE_SHIFT; +} + +u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) +{ + return __memblock_x86_memory_in_range(addr, limit, true); +} + +u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) +{ + return __memblock_x86_memory_in_range(addr, limit, false); +} + +void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) +{ + if (start == end) + return; + + if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) + return; + + memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); + + memblock_reserve(start, end - start); +} + +void __init memblock_x86_free_range(u64 start, u64 end) +{ + if (start == end) + return; + + if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) + return; + + memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); + + memblock_free(start, end - start); +} + +/* + * Need to call this function after memblock_x86_register_active_regions, + * so early_node_map[] is filled already. + */ +u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) +{ + u64 addr; + addr = find_memory_core_early(nid, size, align, start, end); + if (addr != MEMBLOCK_ERROR) + return addr; + + /* Fallback, should already have start end within node range */ + return memblock_find_in_range(start, end, size, align); +} + +/* + * Finds an active region in the address range from start_pfn to last_pfn and + * returns its range in ei_startpfn and ei_endpfn for the memblock entry. + */ +static int __init memblock_x86_find_active_region(const struct memblock_region *ei, + unsigned long start_pfn, + unsigned long last_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) +{ + u64 align = PAGE_SIZE; + + *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; + + /* Skip if map is outside the node */ + if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) + return 0; + + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > last_pfn) + *ei_endpfn = last_pfn; + + return 1; +} + +/* Walk the memblock.memory map and register active regions within a node */ +void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, + unsigned long last_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + struct memblock_region *r; + + for_each_memblock(memory, r) + if (memblock_x86_find_active_region(r, start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); +} + +/* + * Find the hole size (in bytes) in the memory range. 
+ * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +u64 __init memblock_x86_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn, ei_endpfn, ram = 0; + struct memblock_region *r; + + for_each_memblock(memory, r) + if (memblock_x86_find_active_region(r, start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + + return end - start - ((u64)ram << PAGE_SHIFT); +} diff --git a/trunk/arch/x86/mm/memtest.c b/trunk/arch/x86/mm/memtest.c index c80b9fb95734..92faf3a1c53e 100644 --- a/trunk/arch/x86/mm/memtest.c +++ b/trunk/arch/x86/mm/memtest.c @@ -34,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) (unsigned long long) pattern, (unsigned long long) start_bad, (unsigned long long) end_bad); - memblock_reserve(start_bad, end_bad - start_bad); + memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); } static void __init memtest(u64 pattern, u64 start_phys, u64 size) @@ -70,19 +70,24 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) static void __init do_one_pass(u64 pattern, u64 start, u64 end) { - u64 i; - phys_addr_t this_start, this_end; - - for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { - this_start = clamp_t(phys_addr_t, this_start, start, end); - this_end = clamp_t(phys_addr_t, this_end, start, end); - if (this_start < this_end) { - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long)this_start, - (unsigned long long)this_end, - (unsigned long long)cpu_to_be64(pattern)); - memtest(pattern, this_start, this_end - this_start); - } + u64 size = 0; + + while (start < end) { + start = memblock_x86_find_in_range_size(start, &size, 1); + + /* done ? 
*/ + if (start >= end) + break; + if (start + size > end) + size = end - start; + + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) start, + (unsigned long long) start + size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, start, size); + + start += size; } } diff --git a/trunk/arch/x86/mm/numa.c b/trunk/arch/x86/mm/numa.c index 496f494593bf..fbeaaf416610 100644 --- a/trunk/arch/x86/mm/numa.c +++ b/trunk/arch/x86/mm/numa.c @@ -192,6 +192,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) /* Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start, u64 end) { + const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); + const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); bool remapped = false; u64 nd_pa; @@ -222,12 +224,17 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nd_pa = __pa(nd); remapped = true; } else { - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); return; } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); nd = __va(nd_pa); } @@ -364,7 +371,8 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free(__pa(numa_distance), size); + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } @@ -387,13 +395,13 @@ static int __init numa_alloc_distance(void) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), size, PAGE_SIZE); - if (!phys) { + if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_reserve(phys, size); + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); numa_distance = __va(phys); numa_distance_cnt = cnt; @@ -474,8 +482,8 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) numaram = 0; } - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); - + e820ram = max_pfn - (memblock_x86_hole_size(0, + PFN_PHYS(max_pfn)) >> PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", @@ -497,10 +505,13 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *mb = &mi->blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, mb->nid); - } + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); /* * If sections array is gonna be used for pfn -> nid mapping, check @@ -534,8 +545,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) setup_node_data(nid, start, end); } - /* Dump memblock with node info and return. 
*/ - memblock_dump_all(); return 0; } @@ -573,7 +582,7 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); + remove_all_active_ranges(); numa_reset_distance(); ret = init_func(); diff --git a/trunk/arch/x86/mm/numa_32.c b/trunk/arch/x86/mm/numa_32.c index 534255a36b6b..3adebe7e536a 100644 --- a/trunk/arch/x86/mm/numa_32.c +++ b/trunk/arch/x86/mm/numa_32.c @@ -199,23 +199,23 @@ void __init init_alloc_remap(int nid, u64 start, u64 end) /* allocate node memory and the lowmem remap area */ node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (!node_pa) { + if (node_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); return; } - memblock_reserve(node_pa, size); + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT, size, LARGE_PAGE_BYTES); - if (!remap_pa) { + if (remap_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", size, nid); - memblock_free(node_pa, size); + memblock_x86_free_range(node_pa, node_pa + size); return; } - memblock_reserve(remap_pa, size); + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); remap_va = phys_to_virt(remap_pa); /* perform actual remap */ diff --git a/trunk/arch/x86/mm/numa_64.c b/trunk/arch/x86/mm/numa_64.c index 92e27119ee1a..dd27f401f0a0 100644 --- a/trunk/arch/x86/mm/numa_64.c +++ b/trunk/arch/x86/mm/numa_64.c @@ -19,7 +19,7 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - pages += free_low_memory_core_early(MAX_NUMNODES); + pages += free_all_memory_core_early(MAX_NUMNODES); return pages; } diff --git a/trunk/arch/x86/mm/numa_emulation.c b/trunk/arch/x86/mm/numa_emulation.c index 46db56845f18..d0ed086b6247 100644 --- a/trunk/arch/x86/mm/numa_emulation.c +++ b/trunk/arch/x86/mm/numa_emulation.c @@ -28,16 +28,6 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) return -ENOENT; } -static u64 mem_hole_size(u64 start, u64 end) -{ - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = PFN_DOWN(end); - - if (start_pfn < end_pfn) - return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); - return 0; -} - /* * Sets up nid to range from @start to @end. The return value is -errno if * something went wrong, 0 otherwise. @@ -99,7 +89,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Calculate target node size. x86_32 freaks on __udivdi3() so do * the division in ulong number of pages and convert back. */ - size = max_addr - addr - mem_hole_size(addr, max_addr); + size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); /* @@ -145,7 +135,8 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to add memory to this fake node if its * non-reserved memory is less than the per-node size. */ - while (end - start - mem_hole_size(start, end) < size) { + while (end - start - + memblock_x86_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > limit) { end = limit; @@ -159,7 +150,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * this one must extend to the boundary. 
*/ if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -167,7 +158,8 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - mem_hole_size(end, limit) < size) + if (limit - end - + memblock_x86_hole_size(end, limit) < size) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, @@ -188,7 +180,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { u64 end = start + size; - while (end - start - mem_hole_size(start, end) < size) { + while (end - start - memblock_x86_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { end = max_addr; @@ -219,7 +211,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * creates a uniform distribution of node sizes across the entire * machine (but not necessarily over physical nodes). */ - min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; + min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / + MAX_NUMNODES; min_size = max(min_size, FAKE_NODE_MIN_SIZE); if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) min_size = (min_size + FAKE_NODE_MIN_SIZE) & @@ -259,7 +252,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -267,7 +260,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. 
*/ - if (limit - end - mem_hole_size(end, limit) < size) + if (limit - end - + memblock_x86_hole_size(end, limit) < size) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, @@ -357,11 +351,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); - if (!phys) { + if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_reserve(phys, phys_size); + memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) @@ -430,7 +424,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) /* free the copied physical distance table */ if (phys_dist) - memblock_free(__pa(phys_dist), phys_size); + memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); return; no_emu: diff --git a/trunk/arch/x86/mm/pageattr.c b/trunk/arch/x86/mm/pageattr.c index eda2acbb6e81..f9e526742fa1 100644 --- a/trunk/arch/x86/mm/pageattr.c +++ b/trunk/arch/x86/mm/pageattr.c @@ -998,7 +998,7 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); -static int _set_memory_array(unsigned long *addr, int addrinarray, +int _set_memory_array(unsigned long *addr, int addrinarray, unsigned long new_type) { int i, j; diff --git a/trunk/arch/x86/mm/srat.c b/trunk/arch/x86/mm/srat.c index fd61b3fb7341..81dbfdeb080d 100644 --- a/trunk/arch/x86/mm/srat.c +++ b/trunk/arch/x86/mm/srat.c @@ -69,12 +69,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain; - apic_id = pa->apic_id; - if (!cpu_has_x2apic && (apic_id >= 0xff)) { - printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", - pxm, apic_id); - return; - } node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -82,6 +76,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; } + apic_id = pa->apic_id; if (apic_id >= MAX_LOCAL_APIC) { printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; diff --git a/trunk/arch/x86/net/bpf_jit_comp.c b/trunk/arch/x86/net/bpf_jit_comp.c index 7b65f752c5f8..bfab3fa10edc 100644 --- a/trunk/arch/x86/net/bpf_jit_comp.c +++ b/trunk/arch/x86/net/bpf_jit_comp.c @@ -568,8 +568,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; break; } if (filter[i].jt != 0) { - if (filter[i].jf && f_offset) - t_offset += is_near(f_offset) ? 2 : 5; + if (filter[i].jf) + t_offset += is_near(f_offset) ? 
2 : 6; EMIT_COND_JMP(t_op, t_offset); if (filter[i].jf) EMIT_JMP(f_offset); diff --git a/trunk/arch/x86/oprofile/Makefile b/trunk/arch/x86/oprofile/Makefile index 1599f568f0e2..446902b2a6b6 100644 --- a/trunk/arch/x86/oprofile/Makefile +++ b/trunk/arch/x86/oprofile/Makefile @@ -4,8 +4,9 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ oprof.o cpu_buffer.o buffer_sync.o \ event_buffer.o oprofile_files.o \ oprofilefs.o oprofile_stats.o \ - timer_int.o nmi_timer_int.o ) + timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o +oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/trunk/arch/x86/oprofile/init.c b/trunk/arch/x86/oprofile/init.c index 9e138d00ad36..f148cf652678 100644 --- a/trunk/arch/x86/oprofile/init.c +++ b/trunk/arch/x86/oprofile/init.c @@ -16,23 +16,37 @@ * with the NMI mode driver. */ -#ifdef CONFIG_X86_LOCAL_APIC extern int op_nmi_init(struct oprofile_operations *ops); +extern int op_nmi_timer_init(struct oprofile_operations *ops); extern void op_nmi_exit(void); -#else -static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } -static void op_nmi_exit(void) { } -#endif - extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); +static int nmi_timer; + int __init oprofile_arch_init(struct oprofile_operations *ops) { + int ret; + + ret = -ENODEV; + +#ifdef CONFIG_X86_LOCAL_APIC + ret = op_nmi_init(ops); +#endif + nmi_timer = (ret != 0); +#ifdef CONFIG_X86_IO_APIC + if (nmi_timer) + ret = op_nmi_timer_init(ops); +#endif ops->backtrace = x86_backtrace; - return op_nmi_init(ops); + + return ret; } + void oprofile_arch_exit(void) { - op_nmi_exit(); +#ifdef CONFIG_X86_LOCAL_APIC + if (!nmi_timer) + op_nmi_exit(); +#endif } diff --git a/trunk/arch/x86/oprofile/nmi_int.c b/trunk/arch/x86/oprofile/nmi_int.c index 26b8a8514ee5..75f9528e0372 100644 --- a/trunk/arch/x86/oprofile/nmi_int.c +++ b/trunk/arch/x86/oprofile/nmi_int.c @@ -595,36 +595,24 @@ static int __init p4_init(char **cpu_type) return 0; } -enum __force_cpu_type { - reserved = 0, /* do not force */ - timer, - arch_perfmon, -}; - -static int force_cpu_type; - -static int set_cpu_type(const char *str, struct kernel_param *kp) +static int force_arch_perfmon; +static int force_cpu_type(const char *str, struct kernel_param *kp) { - if (!strcmp(str, "timer")) { - force_cpu_type = timer; - printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); - } else if (!strcmp(str, "arch_perfmon")) { - force_cpu_type = arch_perfmon; + if (!strcmp(str, "arch_perfmon")) { + force_arch_perfmon = 1; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); - } else { - force_cpu_type = 0; } return 0; } -module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); +module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) + if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; /* @@ -691,9 +679,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!cpu_has_apic) return -ENODEV; - if (force_cpu_type == timer) - return -ENODEV; - switch (vendor) { case X86_VENDOR_AMD: /* Needs to be at least an Athlon (or hammer in 32bit mode) */ diff --git a/trunk/arch/x86/oprofile/nmi_timer_int.c b/trunk/arch/x86/oprofile/nmi_timer_int.c new file mode 100644 index 
000000000000..7f8052cd6620 --- /dev/null +++ b/trunk/arch/x86/oprofile/nmi_timer_int.c @@ -0,0 +1,50 @@ +/** + * @file nmi_timer_int.c + * + * @remark Copyright 2003 OProfile authors + * @remark Read the file COPYING + * + * @author Zwane Mwaikambo + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs) +{ + oprofile_add_sample(regs, 0); + return NMI_HANDLED; +} + +static int timer_start(void) +{ + if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify, + 0, "oprofile-timer")) + return 1; + return 0; +} + + +static void timer_stop(void) +{ + unregister_nmi_handler(NMI_LOCAL, "oprofile-timer"); + synchronize_sched(); /* Allow already-started NMIs to complete. */ +} + + +int __init op_nmi_timer_init(struct oprofile_operations *ops) +{ + ops->start = timer_start; + ops->stop = timer_stop; + ops->cpu_type = "timer"; + printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); + return 0; +} diff --git a/trunk/arch/x86/platform/efi/efi.c b/trunk/arch/x86/platform/efi/efi.c index 4cf9bd0a1653..37718f0f053d 100644 --- a/trunk/arch/x86/platform/efi/efi.c +++ b/trunk/arch/x86/platform/efi/efi.c @@ -238,8 +238,7 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm, spin_lock_irqsave(&rtc_lock, flags); efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), - virt_to_phys(tc)); + status = efi_call_phys2(efi_phys.get_time, tm, tc); efi_call_phys_epilog(); spin_unlock_irqrestore(&rtc_lock, flags); return status; @@ -353,7 +352,8 @@ void __init efi_memblock_x86_reserve_range(void) boot_params.efi_info.efi_memdesc_size; memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, + "EFI memmap"); } #if EFI_DEBUG @@ -397,14 +397,16 @@ void __init efi_reserve_boot_services(void) if ((start+size >= virt_to_phys(_text) && start <= virt_to_phys(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || - memblock_is_region_reserved(start, size)) { + memblock_x86_check_reserved_size(&start, &size, + 1<num_pages = 0; memblock_dbg(PFX "Could not reserve boot range " "[0x%010llx-0x%010llx]\n", start, start+size-1); } else - memblock_reserve(start, size); + memblock_x86_reserve_range(start, start+size, + "EFI Boot"); } } diff --git a/trunk/arch/x86/tools/Makefile b/trunk/arch/x86/tools/Makefile index d511aa97533a..f82082677337 100644 --- a/trunk/arch/x86/tools/Makefile +++ b/trunk/arch/x86/tools/Makefile @@ -18,21 +18,14 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk quiet_cmd_posttest = TEST $@ cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) -quiet_cmd_sanitytest = TEST $@ - cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000 - -posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity +posttest: $(obj)/test_get_len vmlinux $(call cmd,posttest) - $(call cmd,sanitytest) -hostprogs-y += test_get_len insn_sanity +hostprogs-y := test_get_len # -I needed for generated C source and C source which in the kernel tree. 
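# A hand-run sketch of what the posttest rule above boils down to
# (illustrative; the -y flag assumes an x86-64 vmlinux, -n would be the
# 32-bit case): objdump disassembles vmlinux, distill.awk reduces each
# instruction to its raw bytes, and test_get_len re-decodes those bytes
# with the kernel's own insn.c, reporting any instruction whose computed
# length disagrees with objdump's.
#
#   objdump -d -j .text vmlinux | awk -f arch/x86/tools/distill.awk | \
#           arch/x86/tools/test_get_len -y
#
# A length mismatch is how opcode-map mistakes (a wrong imm/modrm
# attribute, say) show up in practice.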
HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ -HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ - # Dependencies are also needed. $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c -$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c diff --git a/trunk/arch/x86/tools/gen-insn-attr-x86.awk b/trunk/arch/x86/tools/gen-insn-attr-x86.awk index 5f6a5b6c3a15..eaf11f52fc0b 100644 --- a/trunk/arch/x86/tools/gen-insn-attr-x86.awk +++ b/trunk/arch/x86/tools/gen-insn-attr-x86.awk @@ -47,7 +47,7 @@ BEGIN { sep_expr = "^\\|$" group_expr = "^Grp[0-9A-Za-z]+" - imm_expr = "^[IJAOL][a-z]" + imm_expr = "^[IJAO][a-z]" imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" @@ -59,7 +59,6 @@ BEGIN { imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" - imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -71,12 +70,8 @@ BEGIN { lprefix3_expr = "\\(F2\\)" max_lprefix = 4 - # All opcodes starting with lower-case 'v' or with (v1) superscript - # accepts VEX prefix - vexok_opcode_expr = "^v.*" - vexok_expr = "\\(v1\\)" - # All opcodes with (v) superscript supports *only* VEX prefix - vexonly_expr = "\\(v\\)" + vexok_expr = "\\(VEX\\)" + vexonly_expr = "\\(oVEX\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -90,8 +85,8 @@ BEGIN { prefix_num["SEG=GS"] = "INAT_PFX_GS" prefix_num["SEG=SS"] = "INAT_PFX_SS" prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" - prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" - prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" + prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" + prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" clear_vars() } @@ -315,10 +310,12 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(opcode, fpu_expr)) flags = add_flags(flags, "INAT_MODRM") - # check VEX codes + # check VEX only code if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") - else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) + + # check VEX only code + if (match(ext, vexok_expr)) flags = add_flags(flags, "INAT_VEXOK") # check prefixes diff --git a/trunk/arch/x86/tools/insn_sanity.c b/trunk/arch/x86/tools/insn_sanity.c deleted file mode 100644 index cc2f8c131286..000000000000 --- a/trunk/arch/x86/tools/insn_sanity.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * x86 decoder sanity test - based on test_get_insn.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2009 - * Copyright (C) Hitachi, Ltd., 2011 - */ - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> - -#define unlikely(cond) (cond) -#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) - -#include <asm/insn.h> -#include <inat.c> -#include <insn.c> - -/* - * Test of instruction analysis against tampering. - * Feed random binary to instruction decoder and ensure not to - * access out-of-instruction-buffer. - */ - -#define DEFAULT_MAX_ITER 10000 -#define INSN_NOP 0x90 - -static const char *prog; /* Program name */ -static int verbose; /* Verbosity */ -static int x86_64; /* x86-64 bit mode flag */ -static unsigned int seed; /* Random seed */ -static unsigned long iter_start; /* Start of iteration number */ -static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */ -static FILE *input_file; /* Input file name */ - -static void usage(const char *err) -{ - if (err) - fprintf(stderr, "Error: %s\n\n", err); - fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog); - fprintf(stderr, "\t-y 64bit mode\n"); - fprintf(stderr, "\t-n 32bit mode\n"); - fprintf(stderr, "\t-v Verbosity(-vv dumps any decoded result)\n"); - fprintf(stderr, "\t-s Give a random seed (and iteration number)\n"); - fprintf(stderr, "\t-m Give a maximum iteration number\n"); - fprintf(stderr, "\t-i Give an input file with decoded binary\n"); - exit(1); -} - -static void dump_field(FILE *fp, const char *name, const char *indent, - struct insn_field *field) -{ - fprintf(fp, "%s.%s = {\n", indent, name); - fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", - indent, field->value, field->bytes[0], field->bytes[1], - field->bytes[2], field->bytes[3]); - fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, - field->got, field->nbytes); -} - -static void dump_insn(FILE *fp, struct insn *insn) -{ - fprintf(fp, "Instruction = {\n"); - dump_field(fp, "prefixes", "\t", &insn->prefixes); - dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); - dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); - dump_field(fp, "opcode", "\t", &insn->opcode); - dump_field(fp, "modrm", "\t", &insn->modrm); - dump_field(fp, "sib", "\t", &insn->sib); - dump_field(fp, "displacement", "\t", &insn->displacement); - dump_field(fp, "immediate1", "\t", &insn->immediate1); - dump_field(fp, "immediate2", "\t", &insn->immediate2); - fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", - insn->attr, insn->opnd_bytes, insn->addr_bytes); - fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", - insn->length, insn->x86_64, insn->kaddr); -} - -static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, - unsigned char *insn_buf, struct insn *insn) -{ - int i; - - fprintf(fp, "%s:\n", msg); - - dump_insn(fp, insn); - - fprintf(fp, "You can reproduce this with below command(s);\n"); - - /* Input a decoded instruction sequence directly */ - fprintf(fp, " $ echo "); - for (i = 0; i < MAX_INSN_SIZE; i++) - fprintf(fp, " %02x", insn_buf[i]); - fprintf(fp, " | %s -i -\n", prog); - - if
(!input_file) { - fprintf(fp, "Or \n"); - /* Give a seed and iteration number */ - fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter); - } -} - -static void init_random_seed(void) -{ - int fd; - - fd = open("/dev/urandom", O_RDONLY); - if (fd < 0) - goto fail; - - if (read(fd, &seed, sizeof(seed)) != sizeof(seed)) - goto fail; - - close(fd); - return; -fail: - usage("Failed to open /dev/urandom"); -} - -/* Read given instruction sequence from the input file */ -static int read_next_insn(unsigned char *insn_buf) -{ - char buf[256] = "", *tmp; - int i; - - tmp = fgets(buf, ARRAY_SIZE(buf), input_file); - if (tmp == NULL || feof(input_file)) - return 0; - - for (i = 0; i < MAX_INSN_SIZE; i++) { - insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16); - if (*tmp != ' ') - break; - } - - return i; -} - -static int generate_insn(unsigned char *insn_buf) -{ - int i; - - if (input_file) - return read_next_insn(insn_buf); - - /* Fills buffer with random binary up to MAX_INSN_SIZE */ - for (i = 0; i < MAX_INSN_SIZE - 1; i += 2) - *(unsigned short *)(&insn_buf[i]) = random() & 0xffff; - - while (i < MAX_INSN_SIZE) - insn_buf[i++] = random() & 0xff; - - return i; -} - -static void parse_args(int argc, char **argv) -{ - int c; - char *tmp = NULL; - int set_seed = 0; - - prog = argv[0]; - while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) { - switch (c) { - case 'y': - x86_64 = 1; - break; - case 'n': - x86_64 = 0; - break; - case 'v': - verbose++; - break; - case 'i': - if (strcmp("-", optarg) == 0) - input_file = stdin; - else - input_file = fopen(optarg, "r"); - if (!input_file) - usage("Failed to open input file"); - break; - case 's': - seed = (unsigned int)strtoul(optarg, &tmp, 0); - if (*tmp == ',') { - optarg = tmp + 1; - iter_start = strtoul(optarg, &tmp, 0); - } - if (*tmp != '\0' || tmp == optarg) - usage("Failed to parse seed"); - set_seed = 1; - break; - case 'm': - iter_end = strtoul(optarg, &tmp, 0); - if (*tmp != '\0' || tmp == optarg) - usage("Failed to parse max_iter"); - break; - default: - usage(NULL); - } - } - - /* Check errors */ - if (iter_end < iter_start) - usage("Max iteration number must be bigger than iter-num"); - - if (set_seed && input_file) - usage("Don't use input file (-i) with random seed (-s)"); - - /* Initialize random seed */ - if (!input_file) { - if (!set_seed) /* No seed is given */ - init_random_seed(); - srand(seed); - } -} - -int main(int argc, char **argv) -{ - struct insn insn; - int insns = 0; - int errors = 0; - unsigned long i; - unsigned char insn_buf[MAX_INSN_SIZE * 2]; - - parse_args(argc, argv); - - /* Prepare stop bytes with NOPs */ - memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); - - for (i = 0; i < iter_end; i++) { - if (generate_insn(insn_buf) <= 0) - break; - - if (i < iter_start) /* Skip to given iteration number */ - continue; - - /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); - insn_get_length(&insn); - - if (insn.next_byte <= insn.kaddr || - insn.kaddr + MAX_INSN_SIZE < insn.next_byte) { - /* Access out-of-range memory */ - dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn); - errors++; - } else if (verbose && !insn_complete(&insn)) - dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn); - else if (verbose >= 2) - dump_insn(stdout, &insn); - insns++; - } - - fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed); - - return errors ? 
1 : 0; -} diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c index 12eb07bfb267..1f928659c338 100644 --- a/trunk/arch/x86/xen/enlighten.c +++ b/trunk/arch/x86/xen/enlighten.c @@ -1215,6 +1215,8 @@ asmlinkage void __init xen_start_kernel(void) local_irq_disable(); early_boot_irqs_disabled = true; + memblock_init(); + xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); xen_ident_map_ISA(); diff --git a/trunk/arch/x86/xen/mmu.c b/trunk/arch/x86/xen/mmu.c index f4bf8aa574f4..87f6673b1207 100644 --- a/trunk/arch/x86/xen/mmu.c +++ b/trunk/arch/x86/xen/mmu.c @@ -1774,8 +1774,10 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, __xen_write_cr3(true, __pa(pgd)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); return pgd; } @@ -1851,8 +1853,10 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); return initial_page_table; } diff --git a/trunk/arch/x86/xen/setup.c b/trunk/arch/x86/xen/setup.c index e03c63692176..b2c7179fa263 100644 --- a/trunk/arch/x86/xen/setup.c +++ b/trunk/arch/x86/xen/setup.c @@ -75,7 +75,7 @@ static void __init xen_add_extra_mem(u64 start, u64 size) if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_x86_reserve_range(start, start + size, "XEN EXTRA"); xen_max_p2m_pfn = PFN_DOWN(start + size); @@ -311,8 +311,9 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), + __pa(xen_start_info->pt_base), + "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); diff --git a/trunk/arch/xtensa/kernel/time.c b/trunk/arch/xtensa/kernel/time.c index ac62f9cf1e10..f3e5eb43f71c 100644 --- a/trunk/arch/xtensa/kernel/time.c +++ b/trunk/arch/xtensa/kernel/time.c @@ -41,6 +41,14 @@ static struct clocksource ccount_clocksource = { .rating = 200, .read = ccount_read, .mask = CLOCKSOURCE_MASK(32), + /* + * With a shift of 22 the lower limit of the cpu clock is + * 1MHz, where NSEC_PER_CCOUNT is 1000 or a bit less than + * 2^10: Since we have 32 bits and the multiplicator can + * already take up as much as 10 bits, this leaves us with + * remaining upper 22 bits. + */ + .shift = 22, }; static irqreturn_t timer_interrupt(int irq, void *dev_id); @@ -58,7 +66,10 @@ void __init time_init(void) printk("%d.%02d MHz\n", (int)ccount_per_jiffy/(1000000/HZ), (int)(ccount_per_jiffy/(10000/HZ))%100); #endif - clocksource_register_hz(&ccount_clocksource, CCOUNT_PER_JIFFY * HZ); + ccount_clocksource.mult = + clocksource_hz2mult(CCOUNT_PER_JIFFY * HZ, + ccount_clocksource.shift); + clocksource_register(&ccount_clocksource); /* Initialize the linux timer interrupt.
*/ diff --git a/trunk/block/blk-map.c b/trunk/block/blk-map.c index 623e1cd4cffe..164cd0059706 100644 --- a/trunk/block/blk-map.c +++ b/trunk/block/blk-map.c @@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (!reading) + if (rq_data_dir(rq) == WRITE) bio->bi_rw |= REQ_WRITE; if (do_copy) diff --git a/trunk/block/blk-tag.c b/trunk/block/blk-tag.c index 4af6f5cc1167..e74d6d13838f 100644 --- a/trunk/block/blk-tag.c +++ b/trunk/block/blk-tag.c @@ -282,9 +282,18 @@ EXPORT_SYMBOL(blk_queue_resize_tags); void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - unsigned tag = rq->tag; /* negative tags invalid */ + int tag = rq->tag; - BUG_ON(tag >= bqt->real_max_depth); + BUG_ON(tag == -1); + + if (unlikely(tag >= bqt->max_depth)) { + /* + * This can happen after tag depth has been reduced. + * But tag shouldn't be larger than real_max_depth. + */ + WARN_ON(tag >= bqt->real_max_depth); + return; + } list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; diff --git a/trunk/block/cfq-iosched.c b/trunk/block/cfq-iosched.c index 3548705b04e4..4c12869fcf77 100644 --- a/trunk/block/cfq-iosched.c +++ b/trunk/block/cfq-iosched.c @@ -1655,8 +1655,6 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = q->elevator->elevator_data; - /* * reposition in fifo if next is older than rq */ @@ -1671,16 +1669,6 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, cfq_remove_request(next); cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), rq_is_sync(next)); - - cfqq = RQ_CFQQ(next); - /* - * all requests of this queue are merged to other queues, delete it - * from the service tree. If it's the active_queue, - * cfq_dispatch_requests() will choose to expire it or do idle - */ - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && - cfqq != cfqd->active_queue) - cfq_del_cfqq_rr(cfqd, cfqq); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, diff --git a/trunk/block/ioctl.c b/trunk/block/ioctl.c index d510c2a4eff8..ca939fc1030f 100644 --- a/trunk/block/ioctl.c +++ b/trunk/block/ioctl.c @@ -179,26 +179,6 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, */ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); -/* - * Is it an unrecognized ioctl? The correct returns are either - * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a - * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl - * code before returning. - * - * Confused drivers sometimes return EINVAL, which is wrong. It - * means "I understood the ioctl command, but the parameters to - * it were wrong". - * - * We should aim to just fix the broken drivers, the EINVAL case - * should go away. 
- */ -static inline int is_unrecognized_ioctl(int ret) -{ - return ret == -EINVAL || - ret == -ENOTTY || - ret == -ENOIOCTLCMD; -} - /* * always keep this in sync with compat_blkdev_ioctl() */ @@ -216,7 +196,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EACCES; ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) + /* -EINVAL to handle old uncorrected drivers */ + if (ret != -EINVAL && ret != -ENOTTY) return ret; fsync_bdev(bdev); @@ -225,7 +206,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKROSET: ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) + /* -EINVAL to handle old uncorrected drivers */ + if (ret != -EINVAL && ret != -ENOTTY) return ret; if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/trunk/drivers/ata/Kconfig b/trunk/drivers/ata/Kconfig index cf047c406d92..6bdedd7cca2c 100644 --- a/trunk/drivers/ata/Kconfig +++ b/trunk/drivers/ata/Kconfig @@ -820,7 +820,7 @@ config PATA_PLATFORM config PATA_OF_PLATFORM tristate "OpenFirmware platform device PATA support" - depends on PATA_PLATFORM && OF && OF_IRQ + depends on PATA_PLATFORM && OF help This option enables support for generic directly connected ATA devices commonly found on embedded systems with OpenFirmware diff --git a/trunk/drivers/base/cpu.c b/trunk/drivers/base/cpu.c index 3991502b21e5..251acea3d359 100644 --- a/trunk/drivers/base/cpu.c +++ b/trunk/drivers/base/cpu.c @@ -247,13 +247,6 @@ struct sys_device *get_cpu_sysdev(unsigned cpu) } EXPORT_SYMBOL_GPL(get_cpu_sysdev); -bool cpu_is_hotpluggable(unsigned cpu) -{ - struct sys_device *dev = get_cpu_sysdev(cpu); - return dev && container_of(dev, struct cpu, sysdev)->hotpluggable; -} -EXPORT_SYMBOL_GPL(cpu_is_hotpluggable); - int __init cpu_dev_init(void) { int err; diff --git a/trunk/drivers/char/random.c b/trunk/drivers/char/random.c index 85da8740586b..6035ab8d5ef7 100644 --- a/trunk/drivers/char/random.c +++ b/trunk/drivers/char/random.c @@ -624,8 +624,8 @@ static struct timer_rand_state input_timer_state; static void add_timer_randomness(struct timer_rand_state *state, unsigned num) { struct { + cycles_t cycles; long jiffies; - unsigned cycles; unsigned num; } sample; long delta, delta2, delta3; @@ -637,11 +637,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) goto out; sample.jiffies = jiffies; - - /* Use arch random value, fall back to cycles */ - if (!arch_get_random_int(&sample.cycles)) - sample.cycles = get_cycles(); - + sample.cycles = get_cycles(); sample.num = num; mix_pool_bytes(&input_pool, &sample, sizeof(sample)); diff --git a/trunk/drivers/clocksource/acpi_pm.c b/trunk/drivers/clocksource/acpi_pm.c index 6b5cf02c35c8..effe7974aa9a 100644 --- a/trunk/drivers/clocksource/acpi_pm.c +++ b/trunk/drivers/clocksource/acpi_pm.c @@ -143,7 +143,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, #ifndef CONFIG_X86_64 #include <asm/mach_timer.h> #define PMTMR_EXPECTED_RATE \ ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) /* * Some boards have the PMTMR running way too fast. We check * the PMTMR rate against PIT channel 2 to catch these cases.
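For context on the acpi_pm.c hunk above: on x86, CLOCK_TICK_RATE and PIT_TICK_RATE both name the 1.193182 MHz i8253 input clock, so PMTMR_EXPECTED_RATE is just a proportion -- how many 3.579545 MHz ACPI PM-timer ticks should elapse while the PIT counts down one calibration latch. A minimal user-space sketch of that arithmetic and the driver's roughly +/-5% acceptance window follows; the CALIBRATE_LATCH value, pmtmr_rate_ok() and main() are illustrative stand-ins, not kernel code:

	#include <stdio.h>

	#define PMTMR_TICKS_PER_SEC	3579545		/* ACPI PM timer clock, Hz */
	#define PIT_TICK_RATE		1193182		/* i8253 input clock, Hz */
	#define CALIBRATE_LATCH	((PIT_TICK_RATE * 30) / 1000)	/* ~30 ms window (illustrative) */

	/* PM-timer ticks expected while the PIT counts down CALIBRATE_LATCH. */
	#define PMTMR_EXPECTED_RATE \
		((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (PIT_TICK_RATE >> 10))

	/* Accept a measured tick count within ~5% of the expected value. */
	static int pmtmr_rate_ok(unsigned long delta)
	{
		return delta >= (PMTMR_EXPECTED_RATE * 19) / 20 &&
		       delta <= (PMTMR_EXPECTED_RATE * 21) / 20;
	}

	int main(void)
	{
		unsigned long delta = 107000;	/* example PM-timer reading */

		printf("expected %lu ticks, measured %lu: %s\n",
		       (unsigned long)PMTMR_EXPECTED_RATE, delta,
		       pmtmr_rate_ok(delta) ? "OK" : "board PMTMR is off-rate");
		return 0;
	}

A PMTMR that fails this window is either mis-wired or mis-clocked, which is exactly the class of boards the surrounding comment warns about.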
diff --git a/trunk/drivers/clocksource/i8253.c b/trunk/drivers/clocksource/i8253.c index e7cab2da910f..27c49e60b7d6 100644 --- a/trunk/drivers/clocksource/i8253.c +++ b/trunk/drivers/clocksource/i8253.c @@ -53,7 +53,7 @@ static cycle_t i8253_read(struct clocksource *cs) count |= inb_p(PIT_CH0) << 8; /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > PIT_LATCH) { + if (count > LATCH) { outb_p(0x34, PIT_MODE); outb_p(PIT_LATCH & 0xff, PIT_CH0); outb_p(PIT_LATCH >> 8, PIT_CH0); @@ -114,8 +114,8 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - outb_p(PIT_LATCH & 0xff , PIT_CH0); /* LSB */ - outb_p(PIT_LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb_p(LATCH >> 8 , PIT_CH0); /* MSB */ break; case CLOCK_EVT_MODE_SHUTDOWN: diff --git a/trunk/drivers/clocksource/tcb_clksrc.c b/trunk/drivers/clocksource/tcb_clksrc.c index 55d0f95f82f9..79c47e88d5d1 100644 --- a/trunk/drivers/clocksource/tcb_clksrc.c +++ b/trunk/drivers/clocksource/tcb_clksrc.c @@ -59,6 +59,7 @@ static struct clocksource clksrc = { .rating = 200, .read = tc_get_cycles, .mask = CLOCKSOURCE_MASK(32), + .shift = 18, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -255,6 +256,7 @@ static int __init tcb_clksrc_init(void) best_divisor_idx = i; } + clksrc.mult = clocksource_hz2mult(divided_rate, clksrc.shift); printk(bootinfo, clksrc.name, CONFIG_ATMEL_TCB_CLKSRC_BLOCK, divided_rate / 1000000, @@ -290,7 +292,7 @@ static int __init tcb_clksrc_init(void) __raw_writel(ATMEL_TC_SYNC, tcaddr + ATMEL_TC_BCR); /* and away we go! */ - clocksource_register_hz(&clksrc, divided_rate); + clocksource_register(&clksrc); /* channel 2: periodic and oneshot timer support */ setup_clkevents(tc, clk32k_divisor_idx); diff --git a/trunk/drivers/cpufreq/cpufreq_conservative.c b/trunk/drivers/cpufreq/cpufreq_conservative.c index 235a340e81f2..c97b468ee9f7 100644 --- a/trunk/drivers/cpufreq/cpufreq_conservative.c +++ b/trunk/drivers/cpufreq/cpufreq_conservative.c @@ -95,26 +95,27 @@ static struct dbs_tuners { .freq_step = 5, }; -static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) +static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, + cputime64_t *wall) { - u64 idle_time; - u64 cur_wall_time; - u64 busy_time; + cputime64_t idle_time; + cputime64_t cur_wall_time; + cputime64_t busy_time; cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); + busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, + kstat_cpu(cpu).cpustat.system); - busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice); - idle_time = cur_wall_time - busy_time; + idle_time = cputime64_sub(cur_wall_time, busy_time); if (wall) - *wall = jiffies_to_usecs(cur_wall_time); + *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); - return jiffies_to_usecs(idle_time); + return (cputime64_t)jiffies_to_usecs(idle_time); } static inline cputime64_t 
get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) @@ -271,7 +272,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) - dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; } return count; } @@ -352,20 +353,20 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); - wall_time = (unsigned int) - (cur_wall_time - j_dbs_info->prev_cpu_wall); + wall_time = (unsigned int) cputime64_sub(cur_wall_time, + j_dbs_info->prev_cpu_wall); j_dbs_info->prev_cpu_wall = cur_wall_time; - idle_time = (unsigned int) - (cur_idle_time - j_dbs_info->prev_cpu_idle); + idle_time = (unsigned int) cputime64_sub(cur_idle_time, + j_dbs_info->prev_cpu_idle); j_dbs_info->prev_cpu_idle = cur_idle_time; if (dbs_tuners_ins.ignore_nice) { - u64 cur_nice; + cputime64_t cur_nice; unsigned long cur_nice_jiffies; - cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - - j_dbs_info->prev_cpu_nice; + cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, + j_dbs_info->prev_cpu_nice); /* * Assumption: nice time between sampling periods will * be less than 2^32 jiffies for 32 bit sys @@ -373,7 +374,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_nice_jiffies = (unsigned long) cputime64_to_jiffies64(cur_nice); - j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; idle_time += jiffies_to_usecs(cur_nice_jiffies); } @@ -500,9 +501,10 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &j_dbs_info->prev_cpu_wall); - if (dbs_tuners_ins.ignore_nice) + if (dbs_tuners_ins.ignore_nice) { j_dbs_info->prev_cpu_nice = - kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + kstat_cpu(j).cpustat.nice; + } } this_dbs_info->down_skip = 0; this_dbs_info->requested_freq = policy->cur; diff --git a/trunk/drivers/cpufreq/cpufreq_ondemand.c b/trunk/drivers/cpufreq/cpufreq_ondemand.c index 3d679eee70a1..fa8af4ebb1d6 100644 --- a/trunk/drivers/cpufreq/cpufreq_ondemand.c +++ b/trunk/drivers/cpufreq/cpufreq_ondemand.c @@ -119,26 +119,27 @@ static struct dbs_tuners { .powersave_bias = 0, }; -static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) +static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, + cputime64_t *wall) { - u64 idle_time; - u64 cur_wall_time; - u64 busy_time; + cputime64_t idle_time; + cputime64_t cur_wall_time; + cputime64_t busy_time; cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); + busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, + kstat_cpu(cpu).cpustat.system); - busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice); - idle_time = cur_wall_time - busy_time; + idle_time = cputime64_sub(cur_wall_time, busy_time); if (wall) - *wall = 
jiffies_to_usecs(cur_wall_time); + *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); - return jiffies_to_usecs(idle_time); + return (cputime64_t)jiffies_to_usecs(idle_time); } static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) @@ -344,7 +345,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) - dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; } return count; @@ -441,24 +442,24 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time); - wall_time = (unsigned int) - (cur_wall_time - j_dbs_info->prev_cpu_wall); + wall_time = (unsigned int) cputime64_sub(cur_wall_time, + j_dbs_info->prev_cpu_wall); j_dbs_info->prev_cpu_wall = cur_wall_time; - idle_time = (unsigned int) - (cur_idle_time - j_dbs_info->prev_cpu_idle); + idle_time = (unsigned int) cputime64_sub(cur_idle_time, + j_dbs_info->prev_cpu_idle); j_dbs_info->prev_cpu_idle = cur_idle_time; - iowait_time = (unsigned int) - (cur_iowait_time - j_dbs_info->prev_cpu_iowait); + iowait_time = (unsigned int) cputime64_sub(cur_iowait_time, + j_dbs_info->prev_cpu_iowait); j_dbs_info->prev_cpu_iowait = cur_iowait_time; if (dbs_tuners_ins.ignore_nice) { - u64 cur_nice; + cputime64_t cur_nice; unsigned long cur_nice_jiffies; - cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - - j_dbs_info->prev_cpu_nice; + cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, + j_dbs_info->prev_cpu_nice); /* * Assumption: nice time between sampling periods will * be less than 2^32 jiffies for 32 bit sys @@ -466,7 +467,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_nice_jiffies = (unsigned long) cputime64_to_jiffies64(cur_nice); - j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; idle_time += jiffies_to_usecs(cur_nice_jiffies); } @@ -645,9 +646,10 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &j_dbs_info->prev_cpu_wall); - if (dbs_tuners_ins.ignore_nice) + if (dbs_tuners_ins.ignore_nice) { j_dbs_info->prev_cpu_nice = - kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + kstat_cpu(j).cpustat.nice; + } } this_dbs_info->cpu = cpu; this_dbs_info->rate_mult = 1; diff --git a/trunk/drivers/cpufreq/cpufreq_stats.c b/trunk/drivers/cpufreq/cpufreq_stats.c index 2a508edd768b..c5072a91e848 100644 --- a/trunk/drivers/cpufreq/cpufreq_stats.c +++ b/trunk/drivers/cpufreq/cpufreq_stats.c @@ -61,8 +61,9 @@ static int cpufreq_stats_update(unsigned int cpu) spin_lock(&cpufreq_stats_lock); stat = per_cpu(cpufreq_stats_table, cpu); if (stat->time_in_state) - stat->time_in_state[stat->last_index] += - cur_time - stat->last_time; + stat->time_in_state[stat->last_index] = + cputime64_add(stat->time_in_state[stat->last_index], + cputime_sub(cur_time, stat->last_time)); stat->last_time = cur_time; spin_unlock(&cpufreq_stats_lock); return 0; diff --git a/trunk/drivers/dma/Kconfig b/trunk/drivers/dma/Kconfig index 5a99bb3f255a..ab8f469f5cf8 100644 --- a/trunk/drivers/dma/Kconfig +++ b/trunk/drivers/dma/Kconfig @@ -124,7 +124,7 @@ config MV_XOR config MX3_IPU bool "MX3x Image Processing Unit support" - depends on SOC_IMX31 || SOC_IMX35 + depends on ARCH_MX3 select DMA_ENGINE default y 
help @@ -216,7 +216,7 @@ config PCH_DMA config IMX_SDMA tristate "i.MX SDMA support" - depends on ARCH_MX25 || SOC_IMX31 || SOC_IMX35 || ARCH_MX5 + depends on ARCH_MX25 || ARCH_MX3 || ARCH_MX5 select DMA_ENGINE help Support the i.MX SDMA engine. This engine is integrated into diff --git a/trunk/drivers/edac/i7core_edac.c b/trunk/drivers/edac/i7core_edac.c index 8568d9b61875..70ad8923f1d7 100644 --- a/trunk/drivers/edac/i7core_edac.c +++ b/trunk/drivers/edac/i7core_edac.c @@ -2234,7 +2234,7 @@ static void i7core_unregister_mci(struct i7core_dev *i7core_dev) if (pvt->enable_scrub) disable_sdram_scrub_setting(mci); - mce_unregister_decode_chain(&i7_mce_dec); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &i7_mce_dec); /* Disable EDAC polling */ i7core_pci_ctl_release(pvt); @@ -2336,7 +2336,7 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) /* DCLK for scrub rate setting */ pvt->dclk_freq = get_dclk_freq(); - mce_register_decode_chain(&i7_mce_dec); + atomic_notifier_chain_register(&x86_mce_decoder_chain, &i7_mce_dec); return 0; diff --git a/trunk/drivers/edac/mce_amd.c b/trunk/drivers/edac/mce_amd.c index bd926ea2e00c..d0864d9c38ad 100644 --- a/trunk/drivers/edac/mce_amd.c +++ b/trunk/drivers/edac/mce_amd.c @@ -884,7 +884,7 @@ static int __init mce_amd_init(void) pr_info("MCE: In-kernel MCE decoding enabled.\n"); - mce_register_decode_chain(&amd_mce_dec_nb); + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); return 0; } @@ -893,7 +893,7 @@ early_initcall(mce_amd_init); #ifdef MODULE static void __exit mce_amd_exit(void) { - mce_unregister_decode_chain(&amd_mce_dec_nb); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); kfree(fam_ops); } diff --git a/trunk/drivers/edac/sb_edac.c b/trunk/drivers/edac/sb_edac.c index 1dc118d83cc6..7a402bfbee7d 100644 --- a/trunk/drivers/edac/sb_edac.c +++ b/trunk/drivers/edac/sb_edac.c @@ -1609,9 +1609,11 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, mce->cpuvendor, mce->cpuid, mce->time, mce->socketid, mce->apicid); +#ifdef CONFIG_SMP /* Only handle if it is the right mc controller */ if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc) return NOTIFY_DONE; +#endif smp_rmb(); if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { @@ -1659,7 +1661,8 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) debugf0("MC: " __FILE__ ": %s(): mci = %p, dev = %p\n", __func__, mci, &sbridge_dev->pdev[0]->dev); - mce_unregister_decode_chain(&sbridge_mce_dec); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, + &sbridge_mce_dec); /* Remove MC sysfs nodes */ edac_mc_del_mc(mci->dev); @@ -1728,7 +1731,8 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) goto fail0; } - mce_register_decode_chain(&sbridge_mce_dec); + atomic_notifier_chain_register(&x86_mce_decoder_chain, + &sbridge_mce_dec); return 0; fail0: diff --git a/trunk/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/trunk/drivers/gpu/drm/i915/i915_gem_execbuffer.c index b9da8900ae4e..c681dc149d2a 100644 --- a/trunk/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/trunk/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -756,9 +756,9 @@ intel_enable_semaphores(struct drm_device *dev) if (i915_semaphores >= 0) return i915_semaphores; - /* Disable semaphores on SNB */ + /* Enable semaphores on SNB when IO remapping is off */ if (INTEL_INFO(dev)->gen == 6) - return 0; + return !intel_iommu_enabled; return 1; } diff --git a/trunk/drivers/gpu/drm/i915/intel_display.c 
b/trunk/drivers/gpu/drm/i915/intel_display.c index daa5743ccbd6..d809b038ca88 100644 --- a/trunk/drivers/gpu/drm/i915/intel_display.c +++ b/trunk/drivers/gpu/drm/i915/intel_display.c @@ -7922,11 +7922,13 @@ static bool intel_enable_rc6(struct drm_device *dev) return 0; /* - * Disable rc6 on Sandybridge + * Enable rc6 on Sandybridge if DMA remapping is disabled */ if (INTEL_INFO(dev)->gen == 6) { - DRM_DEBUG_DRIVER("Sandybridge: RC6 disabled\n"); - return 0; + DRM_DEBUG_DRIVER("Sandybridge: intel_iommu_enabled %s -- RC6 %sabled\n", + intel_iommu_enabled ? "true" : "false", + !intel_iommu_enabled ? "en" : "dis"); + return !intel_iommu_enabled; } DRM_DEBUG_DRIVER("RC6 enabled\n"); return 1; diff --git a/trunk/drivers/gpu/drm/radeon/evergreen.c b/trunk/drivers/gpu/drm/radeon/evergreen.c index 92c9628c572d..5e00d1670aa9 100644 --- a/trunk/drivers/gpu/drm/radeon/evergreen.c +++ b/trunk/drivers/gpu/drm/radeon/evergreen.c @@ -3276,18 +3276,6 @@ int evergreen_init(struct radeon_device *rdev) rdev->accel_working = false; } } - - /* Don't start up if the MC ucode is missing on BTC parts. - * The default clocks and voltages before the MC ucode - * is loaded are not suffient for advanced operations. - */ - if (ASIC_IS_DCE5(rdev)) { - if (!rdev->mc_fw && !(rdev->flags & RADEON_IS_IGP)) { - DRM_ERROR("radeon: MC ucode required for NI+.\n"); - return -EINVAL; - } - } - return 0; } diff --git a/trunk/drivers/gpu/drm/radeon/radeon_atombios.c b/trunk/drivers/gpu/drm/radeon/radeon_atombios.c index 5082d17d14dc..d24baf30efcb 100644 --- a/trunk/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/trunk/drivers/gpu/drm/radeon/radeon_atombios.c @@ -2560,11 +2560,7 @@ void radeon_atombios_get_power_modes(struct radeon_device *rdev) rdev->pm.current_power_state_index = rdev->pm.default_power_state_index; rdev->pm.current_clock_mode_index = 0; - if (rdev->pm.default_power_state_index >= 0) - rdev->pm.current_vddc = - rdev->pm.power_state[rdev->pm.default_power_state_index].clock_info[0].voltage.voltage; - else - rdev->pm.current_vddc = 0; + rdev->pm.current_vddc = rdev->pm.power_state[rdev->pm.default_power_state_index].clock_info[0].voltage.voltage; } void radeon_atom_set_clock_gating(struct radeon_device *rdev, int enable) diff --git a/trunk/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/trunk/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index f94b33ae2215..8aa1dbb45c67 100644 --- a/trunk/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/trunk/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1093,6 +1093,7 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, struct vmw_surface *surface = NULL; struct vmw_dma_buffer *bo = NULL; struct ttm_base_object *user_obj; + u64 required_size; int ret; /** @@ -1101,9 +1102,8 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, * requested framebuffer. 
*/ - if (!vmw_kms_validate_mode_vram(dev_priv, - mode_cmd->pitch, - mode_cmd->height)) { + required_size = mode_cmd->pitch * mode_cmd->height; + if (unlikely(required_size > (u64) dev_priv->vram_size)) { DRM_ERROR("VRAM size is too small for requested mode.\n"); return ERR_PTR(-ENOMEM); } diff --git a/trunk/drivers/hwmon/coretemp.c b/trunk/drivers/hwmon/coretemp.c index 1fdef885341c..104b3767516c 100644 --- a/trunk/drivers/hwmon/coretemp.c +++ b/trunk/drivers/hwmon/coretemp.c @@ -57,15 +57,16 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) +#ifdef CONFIG_SMP #define TO_PHYS_ID(cpu) cpu_data(cpu).phys_proc_id #define TO_CORE_ID(cpu) cpu_data(cpu).cpu_core_id -#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) - -#ifdef CONFIG_SMP #define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) #else +#define TO_PHYS_ID(cpu) (cpu) +#define TO_CORE_ID(cpu) (cpu) #define for_each_sibling(i, cpu) for (i = 0; false; ) #endif +#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) /* * Per-Core Temperature Data diff --git a/trunk/drivers/input/mouse/sentelic.c b/trunk/drivers/input/mouse/sentelic.c index 86d6f39178b0..c5b12d2e955a 100644 --- a/trunk/drivers/input/mouse/sentelic.c +++ b/trunk/drivers/input/mouse/sentelic.c @@ -2,7 +2,7 @@ * Finger Sensing Pad PS/2 mouse driver. * * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd. - * Copyright (C) 2005-2011 Tai-hwa Liang, Sentelic Corporation. + * Copyright (C) 2005-2010 Tai-hwa Liang, Sentelic Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -162,7 +162,7 @@ static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val) ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2); if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) - goto out; + return -1; if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) { /* inversion is required */ @@ -261,7 +261,7 @@ static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val) ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2); if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) - goto out; + return -1; if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) { ps2_sendbyte(ps2dev, 0x47, FSP_CMD_TIMEOUT2); @@ -309,7 +309,7 @@ static int fsp_get_buttons(struct psmouse *psmouse, int *btn) }; int val; - if (fsp_reg_read(psmouse, FSP_REG_TMOD_STATUS, &val) == -1) + if (fsp_reg_read(psmouse, FSP_REG_TMOD_STATUS1, &val) == -1) return -EIO; *btn = buttons[(val & 0x30) >> 4]; diff --git a/trunk/drivers/input/mouse/sentelic.h b/trunk/drivers/input/mouse/sentelic.h index 2e4af24f8c15..ed1395ac7b8b 100644 --- a/trunk/drivers/input/mouse/sentelic.h +++ b/trunk/drivers/input/mouse/sentelic.h @@ -2,7 +2,7 @@ * Finger Sensing Pad PS/2 mouse driver. * * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd. - * Copyright (C) 2005-2011 Tai-hwa Liang, Sentelic Corporation. + * Copyright (C) 2005-2009 Tai-hwa Liang, Sentelic Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -33,7 +33,6 @@ /* Finger-sensing Pad control registers */ #define FSP_REG_SYSCTL1 0x10 #define FSP_BIT_EN_REG_CLK BIT(5) -#define FSP_REG_TMOD_STATUS 0x20 #define FSP_REG_OPC_QDOWN 0x31 #define FSP_BIT_EN_OPC_TAG BIT(7) #define FSP_REG_OPTZ_XLO 0x34 diff --git a/trunk/drivers/iommu/intel-iommu.c b/trunk/drivers/iommu/intel-iommu.c index 31053a951c34..bdc447fd4766 100644 --- a/trunk/drivers/iommu/intel-iommu.c +++ b/trunk/drivers/iommu/intel-iommu.c @@ -41,7 +41,6 @@ #include <linux/tboot.h> #include <linux/dmi.h> #include <linux/pci-ats.h> -#include <linux/memblock.h> #include <asm/cacheflush.h> #include <asm/iommu.h> @@ -2189,6 +2188,18 @@ static inline void iommu_prepare_isa(void) static int md_domain_init(struct dmar_domain *domain, int guest_width); +static int __init si_domain_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int *ret = datax; + + *ret = iommu_domain_identity_map(si_domain, + (uint64_t)start_pfn << PAGE_SHIFT, + (uint64_t)end_pfn << PAGE_SHIFT); + return *ret; + +} + static int __init si_domain_init(int hw) { struct dmar_drhd_unit *drhd; @@ -2220,15 +2231,9 @@ static int __init si_domain_init(int hw) return 0; for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - ret = iommu_domain_identity_map(si_domain, - PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); - if (ret) - return ret; - } + work_with_active_regions(nid, si_domain_work_fn, &ret); + if (ret) + return ret; } return 0; diff --git a/trunk/drivers/iommu/iommu.c b/trunk/drivers/iommu/iommu.c index 5b5fa5cdaa31..2fb2963df553 100644 --- a/trunk/drivers/iommu/iommu.c +++ b/trunk/drivers/iommu/iommu.c @@ -90,7 +90,7 @@ struct iommu_domain *iommu_domain_alloc(struct bus_type *bus) if (bus == NULL || bus->iommu_ops == NULL) return NULL; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); + domain = kmalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; diff --git a/trunk/drivers/lguest/x86/core.c b/trunk/drivers/lguest/x86/core.c index 39809035320a..65af42f2d593 100644 --- a/trunk/drivers/lguest/x86/core.c +++ b/trunk/drivers/lguest/x86/core.c @@ -697,7 +697,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) * interrupts are enabled. We always leave interrupts enabled while * running the Guest.
*/ - regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; + regs->eflags = X86_EFLAGS_IF | 0x2; /* * The "Extended Instruction Pointer" register says where the Guest is diff --git a/trunk/drivers/macintosh/rack-meter.c b/trunk/drivers/macintosh/rack-meter.c index 6dc26b61219b..2637c139777b 100644 --- a/trunk/drivers/macintosh/rack-meter.c +++ b/trunk/drivers/macintosh/rack-meter.c @@ -81,13 +81,13 @@ static int rackmeter_ignore_nice; */ static inline cputime64_t get_cpu_idle_time(unsigned int cpu) { - u64 retval; + cputime64_t retval; - retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] + - kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + retval = cputime64_add(kstat_cpu(cpu).cpustat.idle, + kstat_cpu(cpu).cpustat.iowait); if (rackmeter_ignore_nice) - retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; + retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice); return retval; } @@ -220,11 +220,13 @@ static void rackmeter_do_timer(struct work_struct *work) int i, offset, load, cumm, pause; cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); - total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall); + total_ticks = (unsigned int)cputime64_sub(cur_jiffies, + rcpu->prev_wall); rcpu->prev_wall = cur_jiffies; total_idle_ticks = get_cpu_idle_time(cpu); - idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); + idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks, + rcpu->prev_idle); rcpu->prev_idle = total_idle_ticks; /* We do a very dumb calculation to update the LEDs for now, diff --git a/trunk/drivers/md/bitmap.c b/trunk/drivers/md/bitmap.c index 6d03774b176e..b6907118283a 100644 --- a/trunk/drivers/md/bitmap.c +++ b/trunk/drivers/md/bitmap.c @@ -1393,6 +1393,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto atomic_read(&bitmap->behind_writes), bitmap->mddev->bitmap_info.max_write_behind); } + if (bitmap->mddev->degraded) + /* Never clear bits or update events_cleared when degraded */ + success = 0; while (sectors) { sector_t blocks; @@ -1406,7 +1409,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto return; } - if (success && !bitmap->mddev->degraded && + if (success && bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; diff --git a/trunk/drivers/md/linear.c b/trunk/drivers/md/linear.c index 627456542fb3..c3273efd08cb 100644 --- a/trunk/drivers/md/linear.c +++ b/trunk/drivers/md/linear.c @@ -230,7 +230,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) return -EINVAL; rdev->raid_disk = rdev->saved_raid_disk; - rdev->saved_raid_disk = -1; newconf = linear_conf(mddev,mddev->raid_disks+1); diff --git a/trunk/drivers/md/md.c b/trunk/drivers/md/md.c index f47f1f8ac44b..ee981737edfc 100644 --- a/trunk/drivers/md/md.c +++ b/trunk/drivers/md/md.c @@ -7360,7 +7360,8 @@ static int remove_and_add_spares(struct mddev *mddev) spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); - } + } else + break; } } } diff --git a/trunk/drivers/md/raid5.c b/trunk/drivers/md/raid5.c index 858fdbb7eb07..31670f8d6b65 100644 --- a/trunk/drivers/md/raid5.c +++ b/trunk/drivers/md/raid5.c @@ -3065,17 +3065,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + else { /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if 
(test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Expanded, &dev->flags)) - /* If we've reshaped into here, we assume it is Insync. - * We will shortly update recovery_offset to make - * it official. - */ - set_bit(R5_Insync, &dev->flags); - + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } if (rdev && test_bit(R5_WriteError, &dev->flags)) { clear_bit(R5_Insync, &dev->flags); if (!test_bit(Faulty, &rdev->flags)) { diff --git a/trunk/drivers/media/video/gspca/gspca.c b/trunk/drivers/media/video/gspca/gspca.c index 2ca10dfec91f..881e04c7ffe6 100644 --- a/trunk/drivers/media/video/gspca/gspca.c +++ b/trunk/drivers/media/video/gspca/gspca.c @@ -838,13 +838,13 @@ static int gspca_init_transfer(struct gspca_dev *gspca_dev) gspca_dev->usb_err = 0; /* do the specific subdriver stuff before endpoint selection */ - intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface); - gspca_dev->alt = gspca_dev->cam.bulk ? intf->num_altsetting : 0; + gspca_dev->alt = 0; if (gspca_dev->sd_desc->isoc_init) { ret = gspca_dev->sd_desc->isoc_init(gspca_dev); if (ret < 0) goto unlock; } + intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface); xfer = gspca_dev->cam.bulk ? USB_ENDPOINT_XFER_BULK : USB_ENDPOINT_XFER_ISOC; @@ -957,7 +957,7 @@ static int gspca_init_transfer(struct gspca_dev *gspca_dev) ret = -EIO; goto out; } - gspca_dev->alt = ep_tb[--alt_idx].alt; + alt = ep_tb[--alt_idx].alt; } } out: diff --git a/trunk/drivers/media/video/omap3isp/ispccdc.c b/trunk/drivers/media/video/omap3isp/ispccdc.c index 54a4a3f22e2e..b0b0fa5a3572 100644 --- a/trunk/drivers/media/video/omap3isp/ispccdc.c +++ b/trunk/drivers/media/video/omap3isp/ispccdc.c @@ -1408,7 +1408,7 @@ static void ccdc_hs_vs_isr(struct isp_ccdc_device *ccdc) { struct isp_pipeline *pipe = to_isp_pipeline(&ccdc->video_out.video.entity); - struct video_device *vdev = ccdc->subdev.devnode; + struct video_device *vdev = &ccdc->subdev.devnode; struct v4l2_event event; memset(&event, 0, sizeof(event)); diff --git a/trunk/drivers/media/video/omap3isp/ispstat.c b/trunk/drivers/media/video/omap3isp/ispstat.c index bc0b2c7349b9..68d539456c55 100644 --- a/trunk/drivers/media/video/omap3isp/ispstat.c +++ b/trunk/drivers/media/video/omap3isp/ispstat.c @@ -496,7 +496,7 @@ static int isp_stat_bufs_alloc(struct ispstat *stat, u32 size) static void isp_stat_queue_event(struct ispstat *stat, int err) { - struct video_device *vdev = stat->subdev.devnode; + struct video_device *vdev = &stat->subdev.devnode; struct v4l2_event event; struct omap3isp_stat_event_status *status = (void *)event.u.data; diff --git a/trunk/drivers/mfd/ab5500-debugfs.c b/trunk/drivers/mfd/ab5500-debugfs.c index b7b2d3483fd4..43c0ebb81956 100644 --- a/trunk/drivers/mfd/ab5500-debugfs.c +++ b/trunk/drivers/mfd/ab5500-debugfs.c @@ -4,7 +4,7 @@ * Debugfs support for the AB5500 MFD driver */ -#include +#include #include #include #include diff --git a/trunk/drivers/mfd/ab8500-core.c b/trunk/drivers/mfd/ab8500-core.c index d3d572b2317b..1e9173804ede 100644 --- a/trunk/drivers/mfd/ab8500-core.c +++ b/trunk/drivers/mfd/ab8500-core.c @@ -620,7 +620,6 @@ static struct resource __devinitdata ab8500_fg_resources[] = { static struct resource __devinitdata ab8500_chargalg_resources[] = {}; -#ifdef CONFIG_DEBUG_FS static struct resource __devinitdata ab8500_debug_resources[] = { { .name = "IRQ_FIRST", @@ -635,7 +634,6 @@ static struct resource __devinitdata ab8500_debug_resources[] = { .flags = IORESOURCE_IRQ, }, }; -#endif static struct resource __devinitdata 
ab8500_usb_resources[] = { { diff --git a/trunk/drivers/mfd/adp5520.c b/trunk/drivers/mfd/adp5520.c index 8d816cce8322..f1d88483112c 100644 --- a/trunk/drivers/mfd/adp5520.c +++ b/trunk/drivers/mfd/adp5520.c @@ -109,7 +109,7 @@ int adp5520_set_bits(struct device *dev, int reg, uint8_t bit_mask) ret = __adp5520_read(chip->client, reg, ®_val); - if (!ret && ((reg_val & bit_mask) != bit_mask)) { + if (!ret && ((reg_val & bit_mask) == 0)) { reg_val |= bit_mask; ret = __adp5520_write(chip->client, reg, reg_val); } diff --git a/trunk/drivers/mfd/da903x.c b/trunk/drivers/mfd/da903x.c index 1924b857a0fb..1b79c37fd599 100644 --- a/trunk/drivers/mfd/da903x.c +++ b/trunk/drivers/mfd/da903x.c @@ -182,7 +182,7 @@ int da903x_set_bits(struct device *dev, int reg, uint8_t bit_mask) if (ret) goto out; - if ((reg_val & bit_mask) != bit_mask) { + if ((reg_val & bit_mask) == 0) { reg_val |= bit_mask; ret = __da903x_write(chip->client, reg, reg_val); } @@ -549,7 +549,6 @@ static int __devexit da903x_remove(struct i2c_client *client) struct da903x_chip *chip = i2c_get_clientdata(client); da903x_remove_subdevs(chip); - free_irq(client->irq, chip); kfree(chip); return 0; } diff --git a/trunk/drivers/mfd/jz4740-adc.c b/trunk/drivers/mfd/jz4740-adc.c index ef39528088f2..1e9ee533eacb 100644 --- a/trunk/drivers/mfd/jz4740-adc.c +++ b/trunk/drivers/mfd/jz4740-adc.c @@ -16,7 +16,6 @@ */ #include -#include #include #include #include diff --git a/trunk/drivers/mfd/tps6586x.c b/trunk/drivers/mfd/tps6586x.c index a5ddf31b60ca..bba26d96c240 100644 --- a/trunk/drivers/mfd/tps6586x.c +++ b/trunk/drivers/mfd/tps6586x.c @@ -197,7 +197,7 @@ int tps6586x_set_bits(struct device *dev, int reg, uint8_t bit_mask) if (ret) goto out; - if ((reg_val & bit_mask) != bit_mask) { + if ((reg_val & bit_mask) == 0) { reg_val |= bit_mask; ret = __tps6586x_write(to_i2c_client(dev), reg, reg_val); } diff --git a/trunk/drivers/mfd/tps65910.c b/trunk/drivers/mfd/tps65910.c index c1da84bc1573..6f5b8cf2f652 100644 --- a/trunk/drivers/mfd/tps65910.c +++ b/trunk/drivers/mfd/tps65910.c @@ -120,7 +120,7 @@ int tps65910_clear_bits(struct tps65910 *tps65910, u8 reg, u8 mask) goto out; } - data &= ~mask; + data &= mask; err = tps65910_i2c_write(tps65910, reg, 1, &data); if (err) dev_err(tps65910->dev, "write to reg %x failed\n", reg); diff --git a/trunk/drivers/mfd/twl-core.c b/trunk/drivers/mfd/twl-core.c index 61e70cfaa774..bfbd66021afd 100644 --- a/trunk/drivers/mfd/twl-core.c +++ b/trunk/drivers/mfd/twl-core.c @@ -363,13 +363,13 @@ int twl_i2c_write(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes) pr_err("%s: invalid module number %d\n", DRIVER_NAME, mod_no); return -EPERM; } - if (unlikely(!inuse)) { - pr_err("%s: not initialized\n", DRIVER_NAME); - return -EPERM; - } sid = twl_map[mod_no].sid; twl = &twl_modules[sid]; + if (unlikely(!inuse)) { + pr_err("%s: client %d is not initialized\n", DRIVER_NAME, sid); + return -EPERM; + } mutex_lock(&twl->xfer_lock); /* * [MSG1]: fill the register address data @@ -420,13 +420,13 @@ int twl_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes) pr_err("%s: invalid module number %d\n", DRIVER_NAME, mod_no); return -EPERM; } - if (unlikely(!inuse)) { - pr_err("%s: not initialized\n", DRIVER_NAME); - return -EPERM; - } sid = twl_map[mod_no].sid; twl = &twl_modules[sid]; + if (unlikely(!inuse)) { + pr_err("%s: client %d is not initialized\n", DRIVER_NAME, sid); + return -EPERM; + } mutex_lock(&twl->xfer_lock); /* [MSG1] fill the register address data */ msg = &twl->xfer_msg[0]; diff --git 
a/trunk/drivers/mfd/twl4030-irq.c b/trunk/drivers/mfd/twl4030-irq.c index 29f11e0765fe..f062c8cc6c38 100644 --- a/trunk/drivers/mfd/twl4030-irq.c +++ b/trunk/drivers/mfd/twl4030-irq.c @@ -432,7 +432,6 @@ struct sih_agent { u32 edge_change; struct mutex irq_lock; - char *irq_name; }; /*----------------------------------------------------------------------*/ @@ -590,7 +589,7 @@ static inline int sih_read_isr(const struct sih *sih) * Generic handler for SIH interrupts ... we "know" this is called * in task context, with IRQs enabled. */ -static irqreturn_t handle_twl4030_sih(int irq, void *data) +static void handle_twl4030_sih(unsigned irq, struct irq_desc *desc) { struct sih_agent *agent = irq_get_handler_data(irq); const struct sih *sih = agent->sih; @@ -603,7 +602,7 @@ static irqreturn_t handle_twl4030_sih(int irq, void *data) pr_err("twl4030: %s SIH, read ISR error %d\n", sih->name, isr); /* REVISIT: recover; eventually mask it all, etc */ - return IRQ_HANDLED; + return; } while (isr) { @@ -617,7 +616,6 @@ static irqreturn_t handle_twl4030_sih(int irq, void *data) pr_err("twl4030: %s SIH, invalid ISR bit %d\n", sih->name, irq); } - return IRQ_HANDLED; } static unsigned twl4030_irq_next; @@ -670,19 +668,18 @@ int twl4030_sih_setup(int module) activate_irq(irq); } + status = irq_base; twl4030_irq_next += i; /* replace generic PIH handler (handle_simple_irq) */ irq = sih_mod + twl4030_irq_base; irq_set_handler_data(irq, agent); - agent->irq_name = kasprintf(GFP_KERNEL, "twl4030_%s", sih->name); - status = request_threaded_irq(irq, NULL, handle_twl4030_sih, 0, - agent->irq_name ?: sih->name, NULL); + irq_set_chained_handler(irq, handle_twl4030_sih); pr_info("twl4030: %s (irq %d) chaining IRQs %d..%d\n", sih->name, irq, irq_base, twl4030_irq_next - 1); - return status < 0 ? status : irq_base; + return status; } /* FIXME need a call to reverse twl4030_sih_setup() ... 
*/ @@ -736,9 +733,8 @@ int twl4030_init_irq(int irq_num, unsigned irq_base, unsigned irq_end) } /* install an irq handler to demultiplex the TWL4030 interrupt */ - status = request_threaded_irq(irq_num, NULL, handle_twl4030_pih, - IRQF_ONESHOT, - "TWL4030-PIH", NULL); + status = request_threaded_irq(irq_num, NULL, handle_twl4030_pih, 0, + "TWL4030-PIH", NULL); if (status < 0) { pr_err("twl4030: could not claim irq%d: %d\n", irq_num, status); goto fail_rqirq; diff --git a/trunk/drivers/mfd/wm8994-core.c b/trunk/drivers/mfd/wm8994-core.c index 61894fced8ea..5d6ba132837e 100644 --- a/trunk/drivers/mfd/wm8994-core.c +++ b/trunk/drivers/mfd/wm8994-core.c @@ -239,7 +239,6 @@ static int wm8994_suspend(struct device *dev) switch (wm8994->type) { case WM8958: - case WM1811: ret = wm8994_reg_read(wm8994, WM8958_MIC_DETECT_1); if (ret < 0) { dev_err(dev, "Failed to read power status: %d\n", ret); diff --git a/trunk/drivers/mmc/host/mmci.c b/trunk/drivers/mmc/host/mmci.c index 0726e59fd418..50b5f9926f64 100644 --- a/trunk/drivers/mmc/host/mmci.c +++ b/trunk/drivers/mmc/host/mmci.c @@ -675,8 +675,7 @@ mmci_data_irq(struct mmci_host *host, struct mmc_data *data, unsigned int status) { /* First check for errors */ - if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR| - MCI_TXUNDERRUN|MCI_RXOVERRUN)) { + if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_TXUNDERRUN|MCI_RXOVERRUN)) { u32 remain, success; /* Terminate the DMA transfer */ @@ -755,12 +754,8 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd, } if (!cmd->data || cmd->error) { - if (host->data) { - /* Terminate the DMA transfer */ - if (dma_inprogress(host)) - mmci_dma_data_error(host); + if (host->data) mmci_stop_data(host); - } mmci_request_end(host, cmd->mrq); } else if (!(cmd->data->flags & MMC_DATA_READ)) { mmci_start_data(host, cmd->data); @@ -960,9 +955,8 @@ static irqreturn_t mmci_irq(int irq, void *dev_id) dev_dbg(mmc_dev(host->mmc), "irq0 (data+cmd) %08x\n", status); data = host->data; - if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR| - MCI_TXUNDERRUN|MCI_RXOVERRUN|MCI_DATAEND| - MCI_DATABLOCKEND) && data) + if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_TXUNDERRUN| + MCI_RXOVERRUN|MCI_DATAEND|MCI_DATABLOCKEND) && data) mmci_data_irq(host, data, status); cmd = host->cmd; diff --git a/trunk/drivers/net/ethernet/freescale/Kconfig b/trunk/drivers/net/ethernet/freescale/Kconfig index 9de37642f09f..5272f9d4dda9 100644 --- a/trunk/drivers/net/ethernet/freescale/Kconfig +++ b/trunk/drivers/net/ethernet/freescale/Kconfig @@ -23,8 +23,8 @@ if NET_VENDOR_FREESCALE config FEC bool "FEC ethernet controller (of ColdFire and some i.MX CPUs)" depends on (M523x || M527x || M5272 || M528x || M520x || M532x || \ - ARCH_MXC || SOC_IMX28) - default ARCH_MXC || SOC_IMX28 if ARM + ARCH_MXC || ARCH_MXS) + default ARCH_MXC || ARCH_MXS if ARM select PHYLIB ---help--- Say Y here if you want to use the built-in 10/100 Fast ethernet diff --git a/trunk/drivers/net/ethernet/marvell/skge.c b/trunk/drivers/net/ethernet/marvell/skge.c index dea0cb4400e2..c7b60839ac99 100644 --- a/trunk/drivers/net/ethernet/marvell/skge.c +++ b/trunk/drivers/net/ethernet/marvell/skge.c @@ -2606,9 +2606,6 @@ static int skge_up(struct net_device *dev) spin_unlock_irq(&hw->hw_lock); napi_enable(&skge->napi); - - skge_set_multicast(dev); - return 0; free_tx_ring: diff --git a/trunk/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/trunk/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 5829e0b47e7e..227997d775e8 100644 --- 
a/trunk/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/trunk/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -147,7 +147,6 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); if (priv->mdev->dev->caps.comp_pool && cq->vector) mlx4_release_eq(priv->mdev->dev, cq->vector); - cq->vector = 0; cq->buf_size = 0; cq->buf = NULL; } diff --git a/trunk/drivers/net/ethernet/realtek/r8169.c b/trunk/drivers/net/ethernet/realtek/r8169.c index c8f47f17186f..67bf07819992 100644 --- a/trunk/drivers/net/ethernet/realtek/r8169.c +++ b/trunk/drivers/net/ethernet/realtek/r8169.c @@ -477,6 +477,7 @@ enum rtl_register_content { /* Config1 register p.24 */ LEDS1 = (1 << 7), LEDS0 = (1 << 6), + MSIEnable = (1 << 5), /* Enable Message Signaled Interrupt */ Speed_down = (1 << 4), MEMMAP = (1 << 3), IOMAP = (1 << 2), @@ -484,7 +485,6 @@ enum rtl_register_content { PMEnable = (1 << 0), /* Power Management Enable */ /* Config2 register p. 25 */ - MSIEnable = (1 << 5), /* 8169 only. Reserved in the 8168. */ PCI_Clock_66MHz = 0x01, PCI_Clock_33MHz = 0x00, @@ -3426,24 +3426,22 @@ static const struct rtl_cfg_info { }; /* Cfg9346_Unlock assumed. */ -static unsigned rtl_try_msi(struct rtl8169_private *tp, +static unsigned rtl_try_msi(struct pci_dev *pdev, void __iomem *ioaddr, const struct rtl_cfg_info *cfg) { - void __iomem *ioaddr = tp->mmio_addr; unsigned msi = 0; u8 cfg2; cfg2 = RTL_R8(Config2) & ~MSIEnable; if (cfg->features & RTL_FEATURE_MSI) { - if (pci_enable_msi(tp->pci_dev)) { - netif_info(tp, hw, tp->dev, "no MSI. Back to INTx.\n"); + if (pci_enable_msi(pdev)) { + dev_info(&pdev->dev, "no MSI. Back to INTx.\n"); } else { cfg2 |= MSIEnable; msi = RTL_FEATURE_MSI; } } - if (tp->mac_version <= RTL_GIGA_MAC_VER_06) - RTL_W8(Config2, cfg2); + RTL_W8(Config2, cfg2); return msi; } @@ -4079,7 +4077,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->features |= RTL_FEATURE_WOL; if ((RTL_R8(Config5) & (UWF | BWF | MWF)) != 0) tp->features |= RTL_FEATURE_WOL; - tp->features |= rtl_try_msi(tp, cfg); + tp->features |= rtl_try_msi(pdev, ioaddr, cfg); RTL_W8(Cfg9346, Cfg9346_Lock); if (rtl_tbi_enabled(tp)) { diff --git a/trunk/drivers/net/ethernet/ti/davinci_cpdma.c b/trunk/drivers/net/ethernet/ti/davinci_cpdma.c index c97d2f590855..dca9d3369cdd 100644 --- a/trunk/drivers/net/ethernet/ti/davinci_cpdma.c +++ b/trunk/drivers/net/ethernet/ti/davinci_cpdma.c @@ -836,13 +836,11 @@ int cpdma_chan_stop(struct cpdma_chan *chan) chan_write(chan, cp, CPDMA_TEARDOWN_VALUE); /* handle completed packets */ - spin_unlock_irqrestore(&chan->lock, flags); do { ret = __cpdma_chan_process(chan); if (ret < 0) break; } while ((ret & CPDMA_DESC_TD_COMPLETE) == 0); - spin_lock_irqsave(&chan->lock, flags); /* remaining packets haven't been tx/rx'ed, clean them up */ while (chan->head) { diff --git a/trunk/drivers/net/usb/asix.c b/trunk/drivers/net/usb/asix.c index e95f0e60a9bc..e6fed4d4cb77 100644 --- a/trunk/drivers/net/usb/asix.c +++ b/trunk/drivers/net/usb/asix.c @@ -1655,10 +1655,6 @@ static const struct usb_device_id products [] = { // ASIX 88772a USB_DEVICE(0x0db0, 0xa877), .driver_info = (unsigned long) &ax88772_info, -}, { - // Asus USB Ethernet Adapter - USB_DEVICE (0x0b95, 0x7e2b), - .driver_info = (unsigned long) &ax88772_info, }, { }, // END }; diff --git a/trunk/drivers/net/wireless/ath/ath9k/main.c b/trunk/drivers/net/wireless/ath/ath9k/main.c index a9c5ae75277e..d2348a5a7809 100644 --- 
a/trunk/drivers/net/wireless/ath/ath9k/main.c +++ b/trunk/drivers/net/wireless/ath/ath9k/main.c @@ -1843,9 +1843,6 @@ static void ath9k_sta_notify(struct ieee80211_hw *hw, struct ath_softc *sc = hw->priv; struct ath_node *an = (struct ath_node *) sta->drv_priv; - if (!(sc->sc_flags & SC_OP_TXAGGR)) - return; - switch (cmd) { case STA_NOTIFY_SLEEP: an->sleeping = true; diff --git a/trunk/drivers/net/wireless/ath/ath9k/rc.c b/trunk/drivers/net/wireless/ath/ath9k/rc.c index 528d5f3e868c..888abc2be3a5 100644 --- a/trunk/drivers/net/wireless/ath/ath9k/rc.c +++ b/trunk/drivers/net/wireless/ath/ath9k/rc.c @@ -1271,9 +1271,7 @@ static void ath_rc_init(struct ath_softc *sc, ath_rc_priv->max_valid_rate = k; ath_rc_sort_validrates(rate_table, ath_rc_priv); - ath_rc_priv->rate_max_phy = (k > 4) ? - ath_rc_priv->valid_rate_index[k-4] : - ath_rc_priv->valid_rate_index[k-1]; + ath_rc_priv->rate_max_phy = ath_rc_priv->valid_rate_index[k-4]; ath_rc_priv->rate_table = rate_table; ath_dbg(common, ATH_DBG_CONFIG, diff --git a/trunk/drivers/net/wireless/b43/pio.c b/trunk/drivers/net/wireless/b43/pio.c index 279a53eae4c5..fcff923b3c18 100644 --- a/trunk/drivers/net/wireless/b43/pio.c +++ b/trunk/drivers/net/wireless/b43/pio.c @@ -617,19 +617,9 @@ static bool pio_rx_frame(struct b43_pio_rxqueue *q) const char *err_msg = NULL; struct b43_rxhdr_fw4 *rxhdr = (struct b43_rxhdr_fw4 *)wl->pio_scratchspace; - size_t rxhdr_size = sizeof(*rxhdr); BUILD_BUG_ON(sizeof(wl->pio_scratchspace) < sizeof(*rxhdr)); - switch (dev->fw.hdr_format) { - case B43_FW_HDR_410: - case B43_FW_HDR_351: - rxhdr_size -= sizeof(rxhdr->format_598) - - sizeof(rxhdr->format_351); - break; - case B43_FW_HDR_598: - break; - } - memset(rxhdr, 0, rxhdr_size); + memset(rxhdr, 0, sizeof(*rxhdr)); /* Check if we have data and wait for it to get ready. 
*/ if (q->rev >= 8) { @@ -667,11 +657,11 @@ static bool pio_rx_frame(struct b43_pio_rxqueue *q) /* Get the preamble (RX header) */ if (q->rev >= 8) { - b43_block_read(dev, rxhdr, rxhdr_size, + b43_block_read(dev, rxhdr, sizeof(*rxhdr), q->mmio_base + B43_PIO8_RXDATA, sizeof(u32)); } else { - b43_block_read(dev, rxhdr, rxhdr_size, + b43_block_read(dev, rxhdr, sizeof(*rxhdr), q->mmio_base + B43_PIO_RXDATA, sizeof(u16)); } diff --git a/trunk/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c b/trunk/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c index 5c7c17c7166a..a7a6def40d05 100644 --- a/trunk/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c +++ b/trunk/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c @@ -606,8 +606,8 @@ int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed) if (ctx->ht.enabled) { /* if HT40 is used, it should not change * after associated except channel switch */ - if (!ctx->ht.is_40mhz || - !iwl_is_associated_ctx(ctx)) + if (iwl_is_associated_ctx(ctx) && + !ctx->ht.is_40mhz) iwlagn_config_ht40(conf, ctx); } else ctx->ht.is_40mhz = false; diff --git a/trunk/drivers/net/wireless/iwlwifi/iwl-agn-tx.c b/trunk/drivers/net/wireless/iwlwifi/iwl-agn-tx.c index df1540ca6102..35a6b71f358c 100644 --- a/trunk/drivers/net/wireless/iwlwifi/iwl-agn-tx.c +++ b/trunk/drivers/net/wireless/iwlwifi/iwl-agn-tx.c @@ -91,10 +91,7 @@ static void iwlagn_tx_cmd_build_basic(struct iwl_priv *priv, tx_cmd->tid_tspec = qc[0] & 0xf; tx_flags &= ~TX_CMD_FLG_SEQ_CTL_MSK; } else { - if (info->flags & IEEE80211_TX_CTL_ASSIGN_SEQ) - tx_flags |= TX_CMD_FLG_SEQ_CTL_MSK; - else - tx_flags &= ~TX_CMD_FLG_SEQ_CTL_MSK; + tx_flags |= TX_CMD_FLG_SEQ_CTL_MSK; } iwlagn_tx_cmd_protection(priv, info, fc, &tx_flags); diff --git a/trunk/drivers/net/wireless/iwlwifi/iwl-agn.c b/trunk/drivers/net/wireless/iwlwifi/iwl-agn.c index e0e9a3dfbc00..bacc06c95e7a 100644 --- a/trunk/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/trunk/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -2850,9 +2850,6 @@ static int iwlagn_mac_tx_sync(struct ieee80211_hw *hw, int ret; u8 sta_id; - if (ctx->ctxid != IWL_RXON_CTX_PAN) - return 0; - IWL_DEBUG_MAC80211(priv, "enter\n"); mutex_lock(&priv->shrd->mutex); @@ -2901,9 +2898,6 @@ static void iwlagn_mac_finish_tx_sync(struct ieee80211_hw *hw, struct iwl_vif_priv *vif_priv = (void *)vif->drv_priv; struct iwl_rxon_context *ctx = vif_priv->ctx; - if (ctx->ctxid != IWL_RXON_CTX_PAN) - return; - IWL_DEBUG_MAC80211(priv, "enter\n"); mutex_lock(&priv->shrd->mutex); diff --git a/trunk/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c b/trunk/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c index 5f17ab8e76ba..ce918980e977 100644 --- a/trunk/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c +++ b/trunk/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c @@ -1197,7 +1197,9 @@ static int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, iwl_print_hex_dump(trans, IWL_DL_TX, (u8 *)tx_cmd->hdr, hdr_len); /* Set up entry for this TFD in Tx byte-count array */ - iwl_trans_txq_update_byte_cnt_tbl(trans, txq, le16_to_cpu(tx_cmd->len)); + if (is_agg) + iwl_trans_txq_update_byte_cnt_tbl(trans, txq, + le16_to_cpu(tx_cmd->len)); dma_sync_single_for_device(bus(trans)->dev, txcmd_phys, firstlen, DMA_BIDIRECTIONAL); diff --git a/trunk/drivers/net/wireless/mwifiex/cmdevt.c b/trunk/drivers/net/wireless/mwifiex/cmdevt.c index 6e0a3eaecf70..ac278156d390 100644 --- a/trunk/drivers/net/wireless/mwifiex/cmdevt.c +++ b/trunk/drivers/net/wireless/mwifiex/cmdevt.c @@ -939,6 +939,7 @@ mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter) { struct 
cmd_ctrl_node *cmd_node = NULL, *tmp_node = NULL; unsigned long cmd_flags; + unsigned long cmd_pending_q_flags; unsigned long scan_pending_q_flags; uint16_t cancel_scan_cmd = false; @@ -948,9 +949,12 @@ mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter) cmd_node = adapter->curr_cmd; cmd_node->wait_q_enabled = false; cmd_node->cmd_flag |= CMD_F_CANCELED; + spin_lock_irqsave(&adapter->cmd_pending_q_lock, + cmd_pending_q_flags); + list_del(&cmd_node->list); + spin_unlock_irqrestore(&adapter->cmd_pending_q_lock, + cmd_pending_q_flags); mwifiex_insert_cmd_to_free_q(adapter, cmd_node); - mwifiex_complete_cmd(adapter, adapter->curr_cmd); - adapter->curr_cmd = NULL; spin_unlock_irqrestore(&adapter->mwifiex_cmd_lock, cmd_flags); } @@ -977,6 +981,7 @@ mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter) spin_unlock_irqrestore(&adapter->mwifiex_cmd_lock, cmd_flags); } adapter->cmd_wait_q.status = -1; + mwifiex_complete_cmd(adapter, adapter->curr_cmd); } /* diff --git a/trunk/drivers/net/wireless/mwifiex/sta_ioctl.c b/trunk/drivers/net/wireless/mwifiex/sta_ioctl.c index 1679c2593b7b..ea4a29b7e331 100644 --- a/trunk/drivers/net/wireless/mwifiex/sta_ioctl.c +++ b/trunk/drivers/net/wireless/mwifiex/sta_ioctl.c @@ -55,14 +55,9 @@ int mwifiex_wait_queue_complete(struct mwifiex_adapter *adapter) { bool cancel_flag = false; int status = adapter->cmd_wait_q.status; - struct cmd_ctrl_node *cmd_queued; + struct cmd_ctrl_node *cmd_queued = adapter->cmd_queued; - if (!adapter->cmd_queued) - return 0; - - cmd_queued = adapter->cmd_queued; adapter->cmd_queued = NULL; - dev_dbg(adapter->dev, "cmd pending\n"); atomic_inc(&adapter->cmd_pending); diff --git a/trunk/drivers/of/platform.c b/trunk/drivers/of/platform.c index 63b3ec48c203..cbd5d701c7e0 100644 --- a/trunk/drivers/of/platform.c +++ b/trunk/drivers/of/platform.c @@ -314,7 +314,7 @@ static const struct of_dev_auxdata *of_dev_lookup(const struct of_dev_auxdata *l if (!lookup) return NULL; - for(; lookup->compatible != NULL; lookup++) { + for(; lookup->name != NULL; lookup++) { if (!of_device_is_compatible(np, lookup->compatible)) continue; if (of_address_to_resource(np, 0, &res)) diff --git a/trunk/drivers/oprofile/nmi_timer_int.c b/trunk/drivers/oprofile/nmi_timer_int.c deleted file mode 100644 index 76f1c9357f39..000000000000 --- a/trunk/drivers/oprofile/nmi_timer_int.c +++ /dev/null @@ -1,173 +0,0 @@ -/** - * @file nmi_timer_int.c - * - * @remark Copyright 2011 Advanced Micro Devices, Inc. 
- * - * @author Robert Richter - */ - -#include -#include -#include -#include -#include - -#ifdef CONFIG_OPROFILE_NMI_TIMER - -static DEFINE_PER_CPU(struct perf_event *, nmi_timer_events); -static int ctr_running; - -static struct perf_event_attr nmi_timer_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -static void nmi_timer_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - event->hw.interrupts = 0; /* don't throttle interrupts */ - oprofile_add_sample(regs, 0); -} - -static int nmi_timer_start_cpu(int cpu) -{ - struct perf_event *event = per_cpu(nmi_timer_events, cpu); - - if (!event) { - event = perf_event_create_kernel_counter(&nmi_timer_attr, cpu, NULL, - nmi_timer_callback, NULL); - if (IS_ERR(event)) - return PTR_ERR(event); - per_cpu(nmi_timer_events, cpu) = event; - } - - if (event && ctr_running) - perf_event_enable(event); - - return 0; -} - -static void nmi_timer_stop_cpu(int cpu) -{ - struct perf_event *event = per_cpu(nmi_timer_events, cpu); - - if (event && ctr_running) - perf_event_disable(event); -} - -static int nmi_timer_cpu_notifier(struct notifier_block *b, unsigned long action, - void *data) -{ - int cpu = (unsigned long)data; - switch (action) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - nmi_timer_start_cpu(cpu); - break; - case CPU_DOWN_PREPARE: - nmi_timer_stop_cpu(cpu); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block nmi_timer_cpu_nb = { - .notifier_call = nmi_timer_cpu_notifier -}; - -static int nmi_timer_start(void) -{ - int cpu; - - get_online_cpus(); - ctr_running = 1; - for_each_online_cpu(cpu) - nmi_timer_start_cpu(cpu); - put_online_cpus(); - - return 0; -} - -static void nmi_timer_stop(void) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - nmi_timer_stop_cpu(cpu); - ctr_running = 0; - put_online_cpus(); -} - -static void nmi_timer_shutdown(void) -{ - struct perf_event *event; - int cpu; - - get_online_cpus(); - unregister_cpu_notifier(&nmi_timer_cpu_nb); - for_each_possible_cpu(cpu) { - event = per_cpu(nmi_timer_events, cpu); - if (!event) - continue; - perf_event_disable(event); - per_cpu(nmi_timer_events, cpu) = NULL; - perf_event_release_kernel(event); - } - - put_online_cpus(); -} - -static int nmi_timer_setup(void) -{ - int cpu, err; - u64 period; - - /* clock cycles per tick: */ - period = (u64)cpu_khz * 1000; - do_div(period, HZ); - nmi_timer_attr.sample_period = period; - - get_online_cpus(); - err = register_cpu_notifier(&nmi_timer_cpu_nb); - if (err) - goto out; - /* can't attach events to offline cpus: */ - for_each_online_cpu(cpu) { - err = nmi_timer_start_cpu(cpu); - if (err) - break; - } - if (err) - nmi_timer_shutdown(); -out: - put_online_cpus(); - return err; -} - -int __init op_nmi_timer_init(struct oprofile_operations *ops) -{ - int err = 0; - - err = nmi_timer_setup(); - if (err) - return err; - nmi_timer_shutdown(); /* only check, don't alloc */ - - ops->create_files = NULL; - ops->setup = nmi_timer_setup; - ops->shutdown = nmi_timer_shutdown; - ops->start = nmi_timer_start; - ops->stop = nmi_timer_stop; - ops->cpu_type = "timer"; - - printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); - - return 0; -} - -#endif diff --git a/trunk/drivers/oprofile/oprof.c b/trunk/drivers/oprofile/oprof.c index ed2c3ec07024..f8c752e408a6 100644 --- a/trunk/drivers/oprofile/oprof.c +++ b/trunk/drivers/oprofile/oprof.c @@ -246,31 +246,37 @@ static int 
__init oprofile_init(void) int err; /* always init architecture to setup backtrace support */ - timer_mode = 0; err = oprofile_arch_init(&oprofile_ops); - if (!err) { - if (!timer && !oprofilefs_register()) - return 0; - oprofile_arch_exit(); - } - /* setup timer mode: */ - timer_mode = 1; - /* no nmi timer mode if oprofile.timer is set */ - if (timer || op_nmi_timer_init(&oprofile_ops)) { + timer_mode = err || timer; /* fall back to timer mode on errors */ + if (timer_mode) { + if (!err) + oprofile_arch_exit(); err = oprofile_timer_init(&oprofile_ops); if (err) return err; } - return oprofilefs_register(); + err = oprofilefs_register(); + if (!err) + return 0; + + /* failed */ + if (timer_mode) + oprofile_timer_exit(); + else + oprofile_arch_exit(); + + return err; } static void __exit oprofile_exit(void) { oprofilefs_unregister(); - if (!timer_mode) + if (timer_mode) + oprofile_timer_exit(); + else oprofile_arch_exit(); } diff --git a/trunk/drivers/oprofile/oprof.h b/trunk/drivers/oprofile/oprof.h index d32ef816337c..177b73de5e5f 100644 --- a/trunk/drivers/oprofile/oprof.h +++ b/trunk/drivers/oprofile/oprof.h @@ -35,15 +35,7 @@ struct dentry; void oprofile_create_files(struct super_block *sb, struct dentry *root); int oprofile_timer_init(struct oprofile_operations *ops); -#ifdef CONFIG_OPROFILE_NMI_TIMER -int op_nmi_timer_init(struct oprofile_operations *ops); -#else -static inline int op_nmi_timer_init(struct oprofile_operations *ops) -{ - return -ENODEV; -} -#endif - +void oprofile_timer_exit(void); int oprofile_set_ulong(unsigned long *addr, unsigned long val); int oprofile_set_timeout(unsigned long time); diff --git a/trunk/drivers/oprofile/timer_int.c b/trunk/drivers/oprofile/timer_int.c index 93404f72dfa8..878fba126582 100644 --- a/trunk/drivers/oprofile/timer_int.c +++ b/trunk/drivers/oprofile/timer_int.c @@ -97,24 +97,24 @@ static struct notifier_block __refdata oprofile_cpu_notifier = { .notifier_call = oprofile_cpu_notify, }; -static int oprofile_hrtimer_setup(void) +int oprofile_timer_init(struct oprofile_operations *ops) { - return register_hotcpu_notifier(&oprofile_cpu_notifier); + int rc; + + rc = register_hotcpu_notifier(&oprofile_cpu_notifier); + if (rc) + return rc; + ops->create_files = NULL; + ops->setup = NULL; + ops->shutdown = NULL; + ops->start = oprofile_hrtimer_start; + ops->stop = oprofile_hrtimer_stop; + ops->cpu_type = "timer"; + printk(KERN_INFO "oprofile: using timer interrupt.\n"); + return 0; } -static void oprofile_hrtimer_shutdown(void) +void oprofile_timer_exit(void) { unregister_hotcpu_notifier(&oprofile_cpu_notifier); } - -int oprofile_timer_init(struct oprofile_operations *ops) -{ - ops->create_files = NULL; - ops->setup = oprofile_hrtimer_setup; - ops->shutdown = oprofile_hrtimer_shutdown; - ops->start = oprofile_hrtimer_start; - ops->stop = oprofile_hrtimer_stop; - ops->cpu_type = "timer"; - printk(KERN_INFO "oprofile: using timer interrupt.\n"); - return 0; -} diff --git a/trunk/drivers/pci/Kconfig b/trunk/drivers/pci/Kconfig index 37856f7c7781..f02b5235056d 100644 --- a/trunk/drivers/pci/Kconfig +++ b/trunk/drivers/pci/Kconfig @@ -98,11 +98,11 @@ config PCI_PASID If unsure, say N. 
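[Editor's note] The timer_int.c hunks above move the hot-CPU notifier registration into oprofile_timer_init(), so the ops table is populated only after the one fallible step has succeeded, and the new oprofile_timer_exit() undoes exactly that step. Below is a minimal, compilable userspace sketch of this register-then-populate / init-exit pairing; every name in it is a stub invented for illustration, not a kernel API:

#include <stdio.h>

struct ops {
	int  (*start)(void);
	void (*stop)(void);
};

static int notifier_registered;

/* Stand-ins for register/unregister_hotcpu_notifier(); assumed shapes only. */
static int  register_notifier_stub(void)   { notifier_registered = 1; return 0; }
static void unregister_notifier_stub(void) { notifier_registered = 0; }

static int  timer_start(void) { return 0; }
static void timer_stop(void)  { }

static int timer_init(struct ops *ops)
{
	int rc;

	rc = register_notifier_stub();	/* the only step that can fail */
	if (rc)
		return rc;		/* nothing to roll back yet */
	ops->start = timer_start;	/* populate only after success */
	ops->stop  = timer_stop;
	return 0;
}

static void timer_exit(void)
{
	unregister_notifier_stub();	/* exact inverse of timer_init() */
}

int main(void)
{
	struct ops ops = { 0 };

	if (timer_init(&ops))
		return 1;
	ops.start();
	ops.stop();
	timer_exit();
	printf("registered=%d\n", notifier_registered);
	return 0;
}

Keeping the fallible call first means a failed init leaves nothing to unwind, and exit remains a one-line mirror of init.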
config PCI_IOAPIC - tristate "PCI IO-APIC hotplug support" if X86 + bool depends on PCI depends on ACPI depends on HOTPLUG - default !X86 + default y config PCI_LABEL def_bool y if (DMI || ACPI) diff --git a/trunk/drivers/pci/ioapic.c b/trunk/drivers/pci/ioapic.c index 205af8dc83c2..5775638ac017 100644 --- a/trunk/drivers/pci/ioapic.c +++ b/trunk/drivers/pci/ioapic.c @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include #include @@ -27,7 +27,7 @@ struct ioapic { u32 gsi_base; }; -static int __devinit ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent) +static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent) { acpi_handle handle; acpi_status status; @@ -88,7 +88,7 @@ static int __devinit ioapic_probe(struct pci_dev *dev, const struct pci_device_i return -ENODEV; } -static void __devexit ioapic_remove(struct pci_dev *dev) +static void ioapic_remove(struct pci_dev *dev) { struct ioapic *ioapic = pci_get_drvdata(dev); @@ -99,12 +99,13 @@ static void __devexit ioapic_remove(struct pci_dev *dev) } -static DEFINE_PCI_DEVICE_TABLE(ioapic_devices) = { - { PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOAPIC, ~0) }, - { PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOXAPIC, ~0) }, +static struct pci_device_id ioapic_devices[] = { + { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_SYSTEM_PIC_IOAPIC << 8, 0xffff00, }, + { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_SYSTEM_PIC_IOXAPIC << 8, 0xffff00, }, { } }; -MODULE_DEVICE_TABLE(pci, ioapic_devices); static struct pci_driver ioapic_driver = { .name = "ioapic", diff --git a/trunk/drivers/rtc/interface.c b/trunk/drivers/rtc/interface.c index 8e286259a007..3bcc7cfcaba7 100644 --- a/trunk/drivers/rtc/interface.c +++ b/trunk/drivers/rtc/interface.c @@ -73,6 +73,8 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm) err = -EINVAL; mutex_unlock(&rtc->ops_lock); + /* A timer might have just expired */ + schedule_work(&rtc->irqwork); return err; } EXPORT_SYMBOL_GPL(rtc_set_time); @@ -112,6 +114,8 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) err = -EINVAL; mutex_unlock(&rtc->ops_lock); + /* A timer might have just expired */ + schedule_work(&rtc->irqwork); return err; } @@ -319,6 +323,20 @@ int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) } EXPORT_SYMBOL_GPL(rtc_read_alarm); +static int ___rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +{ + int err; + + if (!rtc->ops) + err = -ENODEV; + else if (!rtc->ops->set_alarm) + err = -EINVAL; + else + err = rtc->ops->set_alarm(rtc->dev.parent, alarm); + + return err; +} + static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { struct rtc_time tm; @@ -342,14 +360,7 @@ static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) * over right here, before we set the alarm. 
*/ - if (!rtc->ops) - err = -ENODEV; - else if (!rtc->ops->set_alarm) - err = -EINVAL; - else - err = rtc->ops->set_alarm(rtc->dev.parent, alarm); - - return err; + return ___rtc_set_alarm(rtc, alarm); } int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) @@ -396,6 +407,8 @@ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node); } mutex_unlock(&rtc->ops_lock); + /* maybe that was in the past.*/ + schedule_work(&rtc->irqwork); return err; } EXPORT_SYMBOL_GPL(rtc_initialize_alarm); @@ -763,6 +776,20 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) return 0; } +static void rtc_alarm_disable(struct rtc_device *rtc) +{ + struct rtc_wkalrm alarm; + struct rtc_time tm; + + __rtc_read_time(rtc, &tm); + + alarm.time = rtc_ktime_to_tm(ktime_add(rtc_tm_to_ktime(tm), + ktime_set(300, 0))); + alarm.enabled = 0; + + ___rtc_set_alarm(rtc, &alarm); +} + /** * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue * @rtc rtc device @@ -784,8 +811,10 @@ static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) struct rtc_wkalrm alarm; int err; next = timerqueue_getnext(&rtc->timerqueue); - if (!next) + if (!next) { + rtc_alarm_disable(rtc); return; + } alarm.time = rtc_ktime_to_tm(next->expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); @@ -847,7 +876,8 @@ void rtc_timer_do_work(struct work_struct *work) err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) goto again; - } + } else + rtc_alarm_disable(rtc); mutex_unlock(&rtc->ops_lock); } diff --git a/trunk/drivers/usb/dwc3/core.c b/trunk/drivers/usb/dwc3/core.c index 600d82348511..717ebc9ff941 100644 --- a/trunk/drivers/usb/dwc3/core.c +++ b/trunk/drivers/usb/dwc3/core.c @@ -264,7 +264,7 @@ static int __devinit dwc3_core_init(struct dwc3 *dwc) ret = -ENODEV; goto err0; } - dwc->revision = reg; + dwc->revision = reg & DWC3_GSNPSREV_MASK; dwc3_core_soft_reset(dwc); diff --git a/trunk/drivers/usb/gadget/epautoconf.c b/trunk/drivers/usb/gadget/epautoconf.c index 4dff83d2f265..596a0b464e61 100644 --- a/trunk/drivers/usb/gadget/epautoconf.c +++ b/trunk/drivers/usb/gadget/epautoconf.c @@ -130,6 +130,9 @@ ep_matches ( num_req_streams = ep_comp->bmAttributes & 0x1f; if (num_req_streams > ep->max_streams) return 0; + /* Update the ep_comp descriptor if needed */ + if (num_req_streams != ep->max_streams) + ep_comp->bmAttributes = ep->max_streams; } } diff --git a/trunk/drivers/usb/host/isp1760-if.c b/trunk/drivers/usb/host/isp1760-if.c index 2ac4ac2e4ef9..a7dc1e1d45f2 100644 --- a/trunk/drivers/usb/host/isp1760-if.c +++ b/trunk/drivers/usb/host/isp1760-if.c @@ -18,7 +18,7 @@ #include "isp1760-hcd.h" -#if defined(CONFIG_OF) && defined(CONFIG_OF_IRQ) +#ifdef CONFIG_OF #include #include #include @@ -31,7 +31,7 @@ #include #endif -#if defined(CONFIG_OF) && defined(CONFIG_OF_IRQ) +#ifdef CONFIG_OF struct isp1760 { struct usb_hcd *hcd; int rst_gpio; @@ -437,7 +437,7 @@ static int __init isp1760_init(void) ret = platform_driver_register(&isp1760_plat_driver); if (!ret) any_ret = 0; -#if defined(CONFIG_OF) && defined(CONFIG_OF_IRQ) +#ifdef CONFIG_OF ret = platform_driver_register(&isp1760_of_driver); if (!ret) any_ret = 0; @@ -457,7 +457,7 @@ module_init(isp1760_init); static void __exit isp1760_exit(void) { platform_driver_unregister(&isp1760_plat_driver); -#if defined(CONFIG_OF) && defined(CONFIG_OF_IRQ) +#ifdef CONFIG_OF platform_driver_unregister(&isp1760_of_driver); #endif #ifdef CONFIG_PCI diff 
--git a/trunk/drivers/usb/musb/musb_host.c b/trunk/drivers/usb/musb/musb_host.c index 79cb0af779fa..60ddba8066ea 100644 --- a/trunk/drivers/usb/musb/musb_host.c +++ b/trunk/drivers/usb/musb/musb_host.c @@ -774,10 +774,6 @@ static void musb_ep_program(struct musb *musb, u8 epnum, if (musb->double_buffer_not_ok) musb_writew(epio, MUSB_TXMAXP, hw_ep->max_packet_sz_tx); - else if (can_bulk_split(musb, qh->type)) - musb_writew(epio, MUSB_TXMAXP, packet_sz - | ((hw_ep->max_packet_sz_tx / - packet_sz) - 1) << 11); else musb_writew(epio, MUSB_TXMAXP, qh->maxpacket | diff --git a/trunk/drivers/watchdog/coh901327_wdt.c b/trunk/drivers/watchdog/coh901327_wdt.c index 5b89f7d6cd0f..03f449a430d2 100644 --- a/trunk/drivers/watchdog/coh901327_wdt.c +++ b/trunk/drivers/watchdog/coh901327_wdt.c @@ -76,6 +76,8 @@ static int irq; static void __iomem *virtbase; static unsigned long coh901327_users; static unsigned long boot_status; +static u16 wdogenablestore; +static u16 irqmaskstore; static struct device *parent; /* @@ -459,10 +461,6 @@ static int __init coh901327_probe(struct platform_device *pdev) } #ifdef CONFIG_PM - -static u16 wdogenablestore; -static u16 irqmaskstore; - static int coh901327_suspend(struct platform_device *pdev, pm_message_t state) { irqmaskstore = readw(virtbase + U300_WDOG_IMR) & 0x0001U; diff --git a/trunk/drivers/watchdog/hpwdt.c b/trunk/drivers/watchdog/hpwdt.c index 8464ea1c36a1..3774c9b8dac9 100644 --- a/trunk/drivers/watchdog/hpwdt.c +++ b/trunk/drivers/watchdog/hpwdt.c @@ -231,7 +231,6 @@ static int __devinit cru_detect(unsigned long map_entry, cmn_regs.u1.reax = CRU_BIOS_SIGNATURE_VALUE; - set_memory_x((unsigned long)bios32_entrypoint, (2 * PAGE_SIZE)); asminline_call(&cmn_regs, bios32_entrypoint); if (cmn_regs.u1.ral != 0) { @@ -249,10 +248,8 @@ static int __devinit cru_detect(unsigned long map_entry, if ((physical_bios_base + physical_bios_offset)) { cru_rom_addr = ioremap(cru_physical_address, cru_length); - if (cru_rom_addr) { - set_memory_x((unsigned long)cru_rom_addr, cru_length); + if (cru_rom_addr) retval = 0; - } } printk(KERN_DEBUG "hpwdt: CRU Base Address: 0x%lx\n", diff --git a/trunk/drivers/watchdog/iTCO_wdt.c b/trunk/drivers/watchdog/iTCO_wdt.c index 99796c5d913d..ba6ad662635a 100644 --- a/trunk/drivers/watchdog/iTCO_wdt.c +++ b/trunk/drivers/watchdog/iTCO_wdt.c @@ -384,10 +384,10 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); -static int turn_SMI_watchdog_clear_off = 1; +static int turn_SMI_watchdog_clear_off = 0; module_param(turn_SMI_watchdog_clear_off, int, 0); MODULE_PARM_DESC(turn_SMI_watchdog_clear_off, - "Turn off SMI clearing watchdog (depends on TCO-version)(default=1)"); + "Turn off SMI clearing watchdog (default=0)"); /* * Some TCO specific functions @@ -813,7 +813,7 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev, ret = -EIO; goto out_unmap; } - if (turn_SMI_watchdog_clear_off >= iTCO_wdt_private.iTCO_version) { + if (turn_SMI_watchdog_clear_off) { /* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */ val32 = inl(SMI_EN); val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ diff --git a/trunk/drivers/watchdog/sp805_wdt.c b/trunk/drivers/watchdog/sp805_wdt.c index bfaf9bb1ee0d..cc2cfbe33b30 100644 --- a/trunk/drivers/watchdog/sp805_wdt.c +++ b/trunk/drivers/watchdog/sp805_wdt.c @@ -351,7 +351,7 @@ static int __devexit sp805_wdt_remove(struct amba_device *adev) return 0; } -static struct amba_id sp805_wdt_ids[] = { +static struct amba_id 
sp805_wdt_ids[] __initdata = { { .id = 0x00141805, .mask = 0x00ffffff, diff --git a/trunk/firmware/README.AddingFirmware b/trunk/firmware/README.AddingFirmware index ea78c3a17eec..e24cd8986d8b 100644 --- a/trunk/firmware/README.AddingFirmware +++ b/trunk/firmware/README.AddingFirmware @@ -12,7 +12,7 @@ here. This directory is _NOT_ for adding arbitrary new firmware images. The place to add those is the separate linux-firmware repository: - git://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git + git://git.kernel.org/pub/scm/linux/kernel/git/dwmw2/linux-firmware.git That repository contains all these firmware images which have been extracted from older drivers, as well as various new firmware images which @@ -22,7 +22,6 @@ been permitted to redistribute under separate cover. To submit firmware to that repository, please send either a git binary diff or preferably a git pull request to: David Woodhouse - Ben Hutchings Your commit should include an update to the WHENCE file clearly identifying the licence under which the firmware is available, and diff --git a/trunk/fs/btrfs/async-thread.c b/trunk/fs/btrfs/async-thread.c index 0b394580d860..cb97174e2366 100644 --- a/trunk/fs/btrfs/async-thread.c +++ b/trunk/fs/btrfs/async-thread.c @@ -563,8 +563,8 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct list_head *fallback; int ret; - spin_lock_irqsave(&workers->lock, flags); again: + spin_lock_irqsave(&workers->lock, flags); worker = next_worker(workers); if (!worker) { @@ -579,7 +579,6 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ ret = __btrfs_start_workers(workers); - spin_lock_irqsave(&workers->lock, flags); if (ret) goto fallback; goto again; diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index fd1a06df5bc6..0a6b928813a4 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -4590,6 +4590,10 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } if (err > 0) err = -EEXIST; return err; @@ -4651,7 +4655,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, else { init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); - d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4719,7 +4722,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4777,7 +4779,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); - d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -7244,8 +7245,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; out_unlock: - if (!err) - d_instantiate(dentry, inode); nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); if (drop_inode) { diff --git a/trunk/fs/ceph/dir.c b/trunk/fs/ceph/dir.c index 98954003a8d3..3eeb97661262 100644 --- a/trunk/fs/ceph/dir.c +++ b/trunk/fs/ceph/dir.c @@ -1094,19 +1094,42 @@
static int ceph_snapdir_d_revalidate(struct dentry *dentry, /* * Set/clear/test dir complete flag on the dir's dentry. */ +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + return alias; +} + void ceph_dir_set_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } } void ceph_dir_clear_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } } bool ceph_dir_test_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) + return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); return false; } diff --git a/trunk/fs/cifs/connect.c b/trunk/fs/cifs/connect.c index f3670cf72587..8cd4b52d4217 100644 --- a/trunk/fs/cifs/connect.c +++ b/trunk/fs/cifs/connect.c @@ -282,7 +282,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); byte_count += total_in_buf2; /* don't allow buffer to overflow */ - if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) + if (byte_count > CIFSMaxBufSize) return -ENOBUFS; pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); @@ -2122,7 +2122,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 3.3"); + "ntlmv2 in kernel release 3.2"); } ses->overrideSecFlg = volume_info->secFlg; diff --git a/trunk/fs/compat_ioctl.c b/trunk/fs/compat_ioctl.c index a10e428b32b4..51352de88ef1 100644 --- a/trunk/fs/compat_ioctl.c +++ b/trunk/fs/compat_ioctl.c @@ -1506,6 +1506,35 @@ static long do_ioctl_trans(int fd, unsigned int cmd, return -ENOIOCTLCMD; } +static void compat_ioctl_error(struct file *filp, unsigned int fd, + unsigned int cmd, unsigned long arg) +{ + char buf[10]; + char *fn = "?"; + char *path; + + /* find the name of the device. 
*/ + path = (char *)__get_free_page(GFP_KERNEL); + if (path) { + fn = d_path(&filp->f_path, path, PAGE_SIZE); + if (IS_ERR(fn)) + fn = "?"; + } + + sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); + if (!isprint(buf[1])) + sprintf(buf, "%02x", buf[1]); + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", + current->comm, current->pid, + (int)fd, (unsigned int)cmd, buf, + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, + (unsigned int)arg, fn); + + if (path) + free_page((unsigned long)path); +} + static int compat_ioctl_check_table(unsigned int xcmd) { int i; @@ -1592,8 +1621,13 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto found_handler; error = do_ioctl_trans(fd, cmd, arg, filp); - if (error == -ENOIOCTLCMD) - error = -ENOTTY; + if (error == -ENOIOCTLCMD) { + static int count; + + if (++count <= 50) + compat_ioctl_error(filp, fd, cmd, arg); + error = -EINVAL; + } goto out_fput; diff --git a/trunk/fs/fs-writeback.c b/trunk/fs/fs-writeback.c index 517f211a3bd4..ac86f8b3e3cb 100644 --- a/trunk/fs/fs-writeback.c +++ b/trunk/fs/fs-writeback.c @@ -47,6 +47,17 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +const char *wb_reason_name[] = { + [WB_REASON_BACKGROUND] = "background", + [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", + [WB_REASON_SYNC] = "sync", + [WB_REASON_PERIODIC] = "periodic", + [WB_REASON_LAPTOP_TIMER] = "laptop_timer", + [WB_REASON_FREE_MORE_MEM] = "free_more_memory", + [WB_REASON_FS_FREE_SPACE] = "fs_free_space", + [WB_REASON_FORKER_THREAD] = "forker_thread" +}; + /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this diff --git a/trunk/fs/ioctl.c b/trunk/fs/ioctl.c index 066836e81848..1d9b9fcb2db4 100644 --- a/trunk/fs/ioctl.c +++ b/trunk/fs/ioctl.c @@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd, error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) - error = -ENOTTY; + error = -EINVAL; out: return error; } diff --git a/trunk/fs/locks.c b/trunk/fs/locks.c index 637694bf3a03..3b0d05dcd7c1 100644 --- a/trunk/fs/locks.c +++ b/trunk/fs/locks.c @@ -1205,8 +1205,6 @@ int __break_lease(struct inode *inode, unsigned int mode) int want_write = (mode & O_ACCMODE) != O_RDONLY; new_fl = lease_alloc(NULL, want_write ? 
F_WRLCK : F_RDLCK); - if (IS_ERR(new_fl)) - return PTR_ERR(new_fl); lock_flocks(); @@ -1223,6 +1221,12 @@ int __break_lease(struct inode *inode, unsigned int mode) if (fl->fl_owner == current->files) i_have_this_lease = 1; + if (IS_ERR(new_fl) && !i_have_this_lease + && ((mode & O_NONBLOCK) == 0)) { + error = PTR_ERR(new_fl); + goto out; + } + break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; @@ -1280,7 +1284,8 @@ int __break_lease(struct inode *inode, unsigned int mode) out: unlock_flocks(); - locks_free_lock(new_fl); + if (!IS_ERR(new_fl)) + locks_free_lock(new_fl); return error; } diff --git a/trunk/fs/minix/inode.c b/trunk/fs/minix/inode.c index 4d46a6a59070..1d9e33966db0 100644 --- a/trunk/fs/minix/inode.c +++ b/trunk/fs/minix/inode.c @@ -263,6 +263,23 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) goto out_no_root; } + ret = -ENOMEM; + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_iput; + + if (!(s->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + ms->s_state &= ~MINIX_VALID_FS; + mark_buffer_dirty(bh); + } + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs: mounting unchecked file system, " + "running fsck is recommended\n"); + else if (sbi->s_mount_state & MINIX_ERROR_FS) + printk("MINIX-fs: mounting file system with errors, " + "running fsck is recommended\n"); + /* Apparently minix can create filesystems that allocate more blocks for * the bitmaps than needed. We simply ignore that, but verify it didn't * create one with not enough blocks and bail out if so. @@ -283,23 +300,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) goto out_iput; } - ret = -ENOMEM; - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) - goto out_iput; - - if (!(s->s_flags & MS_RDONLY)) { - if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ - ms->s_state &= ~MINIX_VALID_FS; - mark_buffer_dirty(bh); - } - if (!(sbi->s_mount_state & MINIX_VALID_FS)) - printk("MINIX-fs: mounting unchecked file system, " - "running fsck is recommended\n"); - else if (sbi->s_mount_state & MINIX_ERROR_FS) - printk("MINIX-fs: mounting file system with errors, " - "running fsck is recommended\n"); - return 0; out_iput: diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index 8c344f037bd0..3a1dafd228d1 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; - cgtime = gtime = 0; + cutime = cstime = utime = stime = cputime_zero; + cgtime = gtime = cputime_zero; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; @@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime += t->gtime; + gtime = cputime_add(gtime, t->gtime); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; thread_group_times(task, &utime, &stime); - gtime += sig->gtime; + gtime = cputime_add(gtime, sig->gtime); } sid = task_session_nr_ns(task, ns); diff --git a/trunk/fs/proc/stat.c b/trunk/fs/proc/stat.c index 121f77cfef76..2a30d67dd6b8 100644 --- a/trunk/fs/proc/stat.c +++ b/trunk/fs/proc/stat.c @@ -22,29 +22,31 @@ #define arch_idle_time(cpu) 0 #endif -static u64 get_idle_time(int cpu) +static cputime64_t 
get_idle_time(int cpu) { - u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); + cputime64_t idle; if (idle_time == -1ULL) { /* !NO_HZ so we can rely on cpustat.idle */ - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; - idle += arch_idle_time(cpu); + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); } else - idle = usecs_to_cputime64(idle_time); + idle = nsecs_to_jiffies64(1000 * idle_time); return idle; } -static u64 get_iowait_time(int cpu) +static cputime64_t get_iowait_time(int cpu) { - u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); + u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); + cputime64_t iowait; if (iowait_time == -1ULL) /* !NO_HZ so we can rely on cpustat.iowait */ - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kstat_cpu(cpu).cpustat.iowait; else - iowait = usecs_to_cputime64(iowait_time); + iowait = nsecs_to_jiffies64(1000 * iowait_time); return iowait; } @@ -53,30 +55,31 @@ static int show_stat(struct seq_file *p, void *v) { int i, j; unsigned long jif; - u64 user, nice, system, idle, iowait, irq, softirq, steal; - u64 guest, guest_nice; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; user = nice = system = idle = iowait = - irq = softirq = steal = 0; - guest = guest_nice = 0; + irq = softirq = steal = cputime64_zero; + guest = guest_nice = cputime64_zero; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(i); - iowait += get_iowait_time(i); - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + user = cputime64_add(user, kstat_cpu(i).cpustat.user); + nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); + system = cputime64_add(system, kstat_cpu(i).cpustat.system); + idle = cputime64_add(idle, get_idle_time(i)); + iowait = cputime64_add(iowait, get_iowait_time(i)); + irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); + softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); + steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + guest_nice = cputime64_add(guest_nice, + kstat_cpu(i).cpustat.guest_nice); sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -103,16 +106,16 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + user = kstat_cpu(i).cpustat.user; + nice = kstat_cpu(i).cpustat.nice; + system = kstat_cpu(i).cpustat.system; idle = get_idle_time(i); iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = 
kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + irq = kstat_cpu(i).cpustat.irq; + softirq = kstat_cpu(i).cpustat.softirq; + steal = kstat_cpu(i).cpustat.steal; + guest = kstat_cpu(i).cpustat.guest; + guest_nice = kstat_cpu(i).cpustat.guest_nice; seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " "%llu\n", diff --git a/trunk/fs/proc/uptime.c b/trunk/fs/proc/uptime.c index 9610ac772d7e..766b1d456050 100644 --- a/trunk/fs/proc/uptime.c +++ b/trunk/fs/proc/uptime.c @@ -11,20 +11,15 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; - u64 idletime; - u64 nsec; - u32 rem; int i; + cputime_t idletime = cputime_zero; - idletime = 0; for_each_possible_cpu(i) - idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); - nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; - idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); - idle.tv_nsec = rem; + cputime_to_timespec(idletime, &idle); seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), diff --git a/trunk/fs/xfs/xfs_super.c b/trunk/fs/xfs/xfs_super.c index 8a899496fd5f..3eca58f51ae9 100644 --- a/trunk/fs/xfs/xfs_super.c +++ b/trunk/fs/xfs/xfs_super.c @@ -868,6 +868,27 @@ xfs_fs_dirty_inode( XFS_I(inode)->i_update_core = 1; } +STATIC int +xfs_log_inode( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + STATIC int xfs_fs_write_inode( struct inode *inode, @@ -881,8 +902,10 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); + if (!ip->i_update_core) + return 0; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { + if (wbc->sync_mode == WB_SYNC_ALL) { /* * Make sure the inode has made it into the log. Instead * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for us, which reduces the number * of synchronous log forces dramatically. */ - error = xfs_log_dirty_inode(ip, NULL, 0); + error = xfs_log_inode(ip); if (error) goto out; return 0; } else { - /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed.
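[Editor's note] The xfs_log_inode() helper restored above follows the usual XFS transaction ordering: allocate and reserve log space first, and take the inode lock only after the reservation — the one step that can fail — has succeeded, so the error path never holds a lock it must drop. The following is a compilable userspace sketch of that ordering with invented stub types, not the real xfs_trans_* API:

#include <stdio.h>

struct xact       { int reserved; };	/* stand-in for struct xfs_trans */
struct inode_stub { int locked;   };	/* stand-in for struct xfs_inode */

static int  xact_reserve(struct xact *tp)  { tp->reserved = 1; return 0; }
static void xact_cancel(struct xact *tp)   { tp->reserved = 0; }
static void inode_lock(struct inode_stub *ip) { ip->locked = 1; }
static void xact_log_inode(struct xact *tp, struct inode_stub *ip)
{
	(void)tp; (void)ip;		/* would mark the inode core dirty */
}
static int xact_commit(struct xact *tp, struct inode_stub *ip)
{
	tp->reserved = 0;
	ip->locked = 0;			/* commit releases the joined lock */
	return 0;
}

static int log_inode(struct inode_stub *ip)
{
	struct xact tp = { 0 };

	if (xact_reserve(&tp)) {	/* reservation may fail under log pressure */
		xact_cancel(&tp);	/* nothing locked yet, so cancel is safe */
		return -1;
	}
	inode_lock(ip);			/* lock only after the fallible step */
	xact_log_inode(&tp, ip);
	return xact_commit(&tp, ip);
}

int main(void)
{
	struct inode_stub ip = { 0 };
	printf("log_inode -> %d\n", log_inode(&ip));
	return 0;
}

In the kernel code the lock is handed to the transaction via xfs_trans_ijoin() and released by the commit; the stub mirrors that by unlocking inside xact_commit().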
diff --git a/trunk/fs/xfs/xfs_sync.c b/trunk/fs/xfs/xfs_sync.c index f0994aedcd15..be5c51d8f757 100644 --- a/trunk/fs/xfs/xfs_sync.c +++ b/trunk/fs/xfs/xfs_sync.c @@ -336,32 +336,6 @@ xfs_sync_fsdata( return error; } -int -xfs_log_dirty_inode( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - if (!ip->i_update_core) - return 0; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -385,16 +359,6 @@ xfs_quiesce_data( { int error, error2 = 0; - /* - * Log all pending size and timestamp updates. The vfs writeback - * code is supposed to do this, but due to its overagressive - * livelock detection it will skip inodes where appending writes - * were written out in the first non-blocking sync phase if their - * completion took long enough that it happened after taking the - * timestamp for the cut-off in the blocking phase. - */ - xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); - xfs_qm_sync(mp, SYNC_TRYLOCK); xfs_qm_sync(mp, SYNC_WAIT); diff --git a/trunk/fs/xfs/xfs_sync.h b/trunk/fs/xfs/xfs_sync.h index fa965479d788..941202e7ac6e 100644 --- a/trunk/fs/xfs/xfs_sync.h +++ b/trunk/fs/xfs/xfs_sync.h @@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); -int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/trunk/include/asm-generic/cputime.h b/trunk/include/asm-generic/cputime.h index 9a62937c56ca..62ce6823c0f2 100644 --- a/trunk/include/asm-generic/cputime.h +++ b/trunk/include/asm-generic/cputime.h @@ -4,66 +4,70 @@ #include #include -typedef unsigned long __nocast cputime_t; +typedef unsigned long cputime_t; +#define cputime_zero (0UL) #define cputime_one_jiffy jiffies_to_cputime(1) -#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) +#define cputime_max ((~0UL >> 1) - 1) +#define cputime_add(__a, __b) ((__a) + (__b)) +#define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, __n) ((__a) / (__n)) +#define cputime_halve(__a) ((__a) >> 1) +#define cputime_eq(__a, __b) ((__a) == (__b)) +#define cputime_gt(__a, __b) ((__a) > (__b)) +#define cputime_ge(__a, __b) ((__a) >= (__b)) +#define cputime_lt(__a, __b) ((__a) < (__b)) +#define cputime_le(__a, __b) ((__a) <= (__b)) +#define cputime_to_jiffies(__ct) (__ct) #define cputime_to_scaled(__ct) (__ct) -#define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) +#define jiffies_to_cputime(__hz) (__hz) -typedef u64 __nocast cputime64_t; +typedef u64 cputime64_t; -#define cputime64_to_jiffies64(__ct) (__force u64)(__ct) -#define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif) +#define cputime64_zero (0ULL) +#define cputime64_add(__a, __b) ((__a) + (__b)) +#define cputime64_sub(__a, __b) ((__a) - (__b)) +#define cputime64_to_jiffies64(__ct) (__ct) +#define jiffies64_to_cputime64(__jif) (__jif) +#define 
cputime_to_cputime64(__ct) ((u64) __ct) +#define cputime64_gt(__a, __b) ((__a) > (__b)) -#define nsecs_to_cputime64(__ct) \ - jiffies64_to_cputime64(nsecs_to_jiffies64(__ct)) +#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct) /* * Convert cputime to microseconds and back. */ -#define cputime_to_usecs(__ct) \ - jiffies_to_usecs(cputime_to_jiffies(__ct)) -#define usecs_to_cputime(__usec) \ - jiffies_to_cputime(usecs_to_jiffies(__usec)) -#define usecs_to_cputime64(__usec) \ - jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000)) +#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) +#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) /* * Convert cputime to seconds and back. */ -#define cputime_to_secs(jif) (cputime_to_jiffies(jif) / HZ) -#define secs_to_cputime(sec) jiffies_to_cputime((sec) * HZ) +#define cputime_to_secs(jif) ((jif) / HZ) +#define secs_to_cputime(sec) ((sec) * HZ) /* * Convert cputime to timespec and back. */ -#define timespec_to_cputime(__val) \ - jiffies_to_cputime(timespec_to_jiffies(__val)) -#define cputime_to_timespec(__ct,__val) \ - jiffies_to_timespec(cputime_to_jiffies(__ct),__val) +#define timespec_to_cputime(__val) timespec_to_jiffies(__val) +#define cputime_to_timespec(__ct,__val) jiffies_to_timespec(__ct,__val) /* * Convert cputime to timeval and back. */ -#define timeval_to_cputime(__val) \ - jiffies_to_cputime(timeval_to_jiffies(__val)) -#define cputime_to_timeval(__ct,__val) \ - jiffies_to_timeval(cputime_to_jiffies(__ct),__val) +#define timeval_to_cputime(__val) timeval_to_jiffies(__val) +#define cputime_to_timeval(__ct,__val) jiffies_to_timeval(__ct,__val) /* * Convert cputime to clock and back. */ -#define cputime_to_clock_t(__ct) \ - jiffies_to_clock_t(cputime_to_jiffies(__ct)) -#define clock_t_to_cputime(__x) \ - jiffies_to_cputime(clock_t_to_jiffies(__x)) +#define cputime_to_clock_t(__ct) jiffies_to_clock_t(__ct) +#define clock_t_to_cputime(__x) clock_t_to_jiffies(__x) /* * Convert cputime64 to clock. 
*/ -#define cputime64_to_clock_t(__ct) \ - jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct)) +#define cputime64_to_clock_t(__ct) jiffies_64_to_clock_t(__ct) #endif diff --git a/trunk/include/linux/bitops.h b/trunk/include/linux/bitops.h index 3c1063acb2ab..a3ef66a2a083 100644 --- a/trunk/include/linux/bitops.h +++ b/trunk/include/linux/bitops.h @@ -22,14 +22,8 @@ extern unsigned long __sw_hweight64(__u64 w); #include #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) static __inline__ int get_bitmask_order(unsigned int count) diff --git a/trunk/include/linux/bootmem.h b/trunk/include/linux/bootmem.h index 66d3e954eb6c..ab344a521105 100644 --- a/trunk/include/linux/bootmem.h +++ b/trunk/include/linux/bootmem.h @@ -44,7 +44,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long endpfn); extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); -extern unsigned long free_low_memory_core_early(int nodeid); +unsigned long free_all_memory_core_early(int nodeid); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); diff --git a/trunk/include/linux/cpu.h b/trunk/include/linux/cpu.h index 305c263021e7..6cb60fd2ea84 100644 --- a/trunk/include/linux/cpu.h +++ b/trunk/include/linux/cpu.h @@ -27,7 +27,6 @@ struct cpu { extern int register_cpu(struct cpu *cpu, int num); extern struct sys_device *get_cpu_sysdev(unsigned cpu); -extern bool cpu_is_hotpluggable(unsigned cpu); extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr); extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); diff --git a/trunk/include/linux/debugobjects.h b/trunk/include/linux/debugobjects.h index 0e5f5785d9f2..65970b811e22 100644 --- a/trunk/include/linux/debugobjects.h +++ b/trunk/include/linux/debugobjects.h @@ -46,8 +46,6 @@ struct debug_obj { * fails * @fixup_free: fixup function, which is called when the free check * fails - * @fixup_assert_init: fixup function, which is called when the assert_init - * check fails */ struct debug_obj_descr { const char *name; @@ -56,7 +54,6 @@ struct debug_obj_descr { int (*fixup_activate) (void *addr, enum debug_obj_state state); int (*fixup_destroy) (void *addr, enum debug_obj_state state); int (*fixup_free) (void *addr, enum debug_obj_state state); - int (*fixup_assert_init)(void *addr, enum debug_obj_state state); }; #ifdef CONFIG_DEBUG_OBJECTS @@ -67,7 +64,6 @@ extern void debug_object_activate (void *addr, struct debug_obj_descr *descr); extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr); extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr); extern void debug_object_free (void *addr, struct debug_obj_descr *descr); -extern void debug_object_assert_init(void *addr, struct debug_obj_descr *descr); /* * Active state: @@ -93,8 +89,6 @@ static inline void debug_object_destroy (void *addr, struct debug_obj_descr *descr) { } static inline void debug_object_free (void *addr, struct debug_obj_descr *descr) { } -static inline void -debug_object_assert_init(void *addr, struct debug_obj_descr *descr) { } static 
inline void debug_objects_early_init(void) { } static inline void debug_objects_mem_init(void) { } diff --git a/trunk/include/linux/hardirq.h b/trunk/include/linux/hardirq.h index bb7f30971858..f743883f769e 100644 --- a/trunk/include/linux/hardirq.h +++ b/trunk/include/linux/hardirq.h @@ -139,7 +139,20 @@ static inline void account_system_vtime(struct task_struct *tsk) extern void account_system_vtime(struct task_struct *tsk); #endif +#if defined(CONFIG_NO_HZ) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); + +static inline void rcu_irq_enter(void) +{ + rcu_exit_nohz(); +} + +static inline void rcu_irq_exit(void) +{ + rcu_enter_nohz(); +} static inline void rcu_nmi_enter(void) { @@ -150,9 +163,17 @@ static inline void rcu_nmi_exit(void) } #else +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); extern void rcu_nmi_enter(void); extern void rcu_nmi_exit(void); #endif +#else +# define rcu_irq_enter() do { } while (0) +# define rcu_irq_exit() do { } while (0) +# define rcu_nmi_enter() do { } while (0) +# define rcu_nmi_exit() do { } while (0) +#endif /* #if defined(CONFIG_NO_HZ) */ /* * It is safe to do non-atomic ops on ->hardirq_context, diff --git a/trunk/include/linux/jump_label.h b/trunk/include/linux/jump_label.h index 5ce8b140428f..388b0d425b50 100644 --- a/trunk/include/linux/jump_label.h +++ b/trunk/include/linux/jump_label.h @@ -3,7 +3,6 @@ #include #include -#include #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) @@ -15,12 +14,6 @@ struct jump_label_key { #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; - unsigned long timeout; - struct delayed_work work; -}; - # include # define HAVE_JUMP_LABEL #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ @@ -58,11 +51,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void jump_label_inc(struct jump_label_key *key); extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); extern bool jump_label_enabled(struct jump_label_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); #else /* !HAVE_JUMP_LABEL */ @@ -78,10 +68,6 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; -}; - static __always_inline bool static_branch(struct jump_label_key *key) { if (unlikely(atomic_read(&key->enabled))) @@ -99,11 +85,6 @@ static inline void jump_label_dec(struct jump_label_key *key) atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) -{ - jump_label_dec(&key->key); -} - static inline int jump_label_text_reserved(void *start, void *end) { return 0; @@ -121,14 +102,6 @@ static inline int jump_label_apply_nops(struct module *mod) { return 0; } - -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl) -{ -} #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) - #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/trunk/include/linux/kernel_stat.h b/trunk/include/linux/kernel_stat.h index 2fbd9053c2df..0cce2db580c3 100644 --- 
a/trunk/include/linux/kernel_stat.h +++ b/trunk/include/linux/kernel_stat.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -16,25 +15,21 @@ * used by rstatd/perfmeter */ -enum cpu_usage_stat { - CPUTIME_USER, - CPUTIME_NICE, - CPUTIME_SYSTEM, - CPUTIME_SOFTIRQ, - CPUTIME_IRQ, - CPUTIME_IDLE, - CPUTIME_IOWAIT, - CPUTIME_STEAL, - CPUTIME_GUEST, - CPUTIME_GUEST_NICE, - NR_STATS, -}; - -struct kernel_cpustat { - u64 cpustat[NR_STATS]; +struct cpu_usage_stat { + cputime64_t user; + cputime64_t nice; + cputime64_t system; + cputime64_t softirq; + cputime64_t irq; + cputime64_t idle; + cputime64_t iowait; + cputime64_t steal; + cputime64_t guest; + cputime64_t guest_nice; }; struct kernel_stat { + struct cpu_usage_stat cpustat; #ifndef CONFIG_GENERIC_HARDIRQS unsigned int irqs[NR_IRQS]; #endif @@ -43,13 +38,10 @@ struct kernel_stat { }; DECLARE_PER_CPU(struct kernel_stat, kstat); -DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); +#define kstat_cpu(cpu) per_cpu(kstat, cpu) /* Must have preemption disabled for this to be meaningful. */ -#define kstat_this_cpu (&__get_cpu_var(kstat)) -#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat)) -#define kstat_cpu(cpu) per_cpu(kstat, cpu) -#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu) +#define kstat_this_cpu __get_cpu_var(kstat) extern unsigned long long nr_context_switches(void); diff --git a/trunk/include/linux/kvm.h b/trunk/include/linux/kvm.h index 68e67e50d028..c3892fc1d538 100644 --- a/trunk/include/linux/kvm.h +++ b/trunk/include/linux/kvm.h @@ -557,7 +557,6 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 -#define KVM_CAP_TSC_DEADLINE_TIMER 72 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/trunk/include/linux/latencytop.h b/trunk/include/linux/latencytop.h index e23121f9d82a..b0e99898527c 100644 --- a/trunk/include/linux/latencytop.h +++ b/trunk/include/linux/latencytop.h @@ -10,8 +10,6 @@ #define _INCLUDE_GUARD_LATENCYTOP_H_ #include -struct task_struct; - #ifdef CONFIG_LATENCYTOP #define LT_SAVECOUNT 32 @@ -25,6 +23,7 @@ struct latency_record { }; +struct task_struct; extern int latencytop_enabled; void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); diff --git a/trunk/include/linux/lglock.h b/trunk/include/linux/lglock.h index 87f402ccec55..f549056fb20b 100644 --- a/trunk/include/linux/lglock.h +++ b/trunk/include/linux/lglock.h @@ -22,7 +22,6 @@ #include #include #include -#include /* can make br locks by using local lock for read side, global lock for write */ #define br_lock_init(name) name##_lock_init() @@ -73,31 +72,9 @@ #define DEFINE_LGLOCK(name) \ \ - DEFINE_SPINLOCK(name##_cpu_lock); \ - cpumask_t name##_cpus __read_mostly; \ DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ DEFINE_LGLOCK_LOCKDEP(name); \ \ - static int \ - name##_lg_cpu_callback(struct notifier_block *nb, \ - unsigned long action, void *hcpu) \ - { \ - switch (action & ~CPU_TASKS_FROZEN) { \ - case CPU_UP_PREPARE: \ - spin_lock(&name##_cpu_lock); \ - cpu_set((unsigned long)hcpu, name##_cpus); \ - spin_unlock(&name##_cpu_lock); \ - break; \ - case CPU_UP_CANCELED: case CPU_DEAD: \ - spin_lock(&name##_cpu_lock); \ - cpu_clear((unsigned long)hcpu, name##_cpus); \ - spin_unlock(&name##_cpu_lock); \ - } \ - return NOTIFY_OK; \ - } \ - static struct notifier_block name##_lg_cpu_notifier = { \ - .notifier_call = name##_lg_cpu_callback, \ - }; \ void name##_lock_init(void) { \ int i; \ 
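The kernel_stat.h hunk above reverts from the array-indexed kernel_cpustat back to named cputime64_t fields, so per-CPU accounting is read through kstat_cpu() again. A one-line sketch (idle time chosen arbitrarily):

/* Sketch: read a CPU's accumulated idle time with the reverted layout. */
static cputime64_t idle_time(int cpu)
{
	return kstat_cpu(cpu).cpustat.idle;
}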
LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ @@ -106,11 +83,6 @@ lock = &per_cpu(name##_lock, i); \ *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ } \ - register_hotcpu_notifier(&name##_lg_cpu_notifier); \ - get_online_cpus(); \ - for_each_online_cpu(i) \ - cpu_set(i, name##_cpus); \ - put_online_cpus(); \ } \ EXPORT_SYMBOL(name##_lock_init); \ \ @@ -152,9 +124,9 @@ \ void name##_global_lock_online(void) { \ int i; \ - spin_lock(&name##_cpu_lock); \ + preempt_disable(); \ rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_cpu(i, &name##_cpus) { \ + for_each_online_cpu(i) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_lock(lock); \ @@ -165,12 +137,12 @@ void name##_global_unlock_online(void) { \ int i; \ rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_cpu(i, &name##_cpus) { \ + for_each_online_cpu(i) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_unlock(lock); \ } \ - spin_unlock(&name##_cpu_lock); \ + preempt_enable(); \ } \ EXPORT_SYMBOL(name##_global_unlock_online); \ \ diff --git a/trunk/include/linux/lockdep.h b/trunk/include/linux/lockdep.h index d36619ead3ba..b6a56e37284c 100644 --- a/trunk/include/linux/lockdep.h +++ b/trunk/include/linux/lockdep.h @@ -343,8 +343,6 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l)) -#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) - #else /* !LOCKDEP */ static inline void lockdep_off(void) @@ -394,8 +392,6 @@ struct lock_class_key { }; #define lockdep_assert_held(l) do { } while (0) -#define lockdep_recursing(tsk) (0) - #endif /* !LOCKDEP */ #ifdef CONFIG_LOCK_STAT diff --git a/trunk/include/linux/memblock.h b/trunk/include/linux/memblock.h index a6bb10235148..e6b843e16e81 100644 --- a/trunk/include/linux/memblock.h +++ b/trunk/include/linux/memblock.h @@ -2,6 +2,8 @@ #define _LINUX_MEMBLOCK_H #ifdef __KERNEL__ +#define MEMBLOCK_ERROR 0 + #ifdef CONFIG_HAVE_MEMBLOCK /* * Logical memory blocks. @@ -17,161 +19,81 @@ #include #include +#include + #define INIT_MEMBLOCK_REGIONS 128 struct memblock_region { phys_addr_t base; phys_addr_t size; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - int nid; -#endif }; struct memblock_type { unsigned long cnt; /* number of regions */ unsigned long max; /* size of the allocated array */ - phys_addr_t total_size; /* size of all regions */ struct memblock_region *regions; }; struct memblock { phys_addr_t current_limit; + phys_addr_t memory_size; /* Updated by memblock_analyze() */ struct memblock_type memory; struct memblock_type reserved; }; extern struct memblock memblock; extern int memblock_debug; +extern int memblock_can_resize; #define memblock_dbg(fmt, ...) 
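The lglock revert above drops the hotplug-notifier bookkeeping and goes back to walking all online CPUs under preempt_disable(). For context, a minimal sketch of how an lglock is consumed; my_lglock is hypothetical (the real in-tree user is files_lglock), and my_lglock_lock_init() must run once during boot:

/* Sketch: cheap per-CPU read side, expensive all-CPU write side. */
DEFINE_LGLOCK(my_lglock);

static void reader(void)
{
	br_read_lock(my_lglock);	/* takes only this CPU's spinlock */
	/* ... traverse the protected structure ... */
	br_read_unlock(my_lglock);
}

static void writer(void)
{
	br_write_lock(my_lglock);	/* takes every online CPU's spinlock */
	/* ... modify the protected structure ... */
	br_write_unlock(my_lglock);
}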
\ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid); -phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align); +u64 memblock_find_in_range(u64 start, u64 end, u64 size, u64 align); int memblock_free_reserved_regions(void); int memblock_reserve_reserved_regions(void); -void memblock_allow_resize(void); -int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); -int memblock_add(phys_addr_t base, phys_addr_t size); -int memblock_remove(phys_addr_t base, phys_addr_t size); -int memblock_free(phys_addr_t base, phys_addr_t size); -int memblock_reserve(phys_addr_t base, phys_addr_t size); - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, - unsigned long *out_end_pfn, int *out_nid); - -/** - * for_each_mem_pfn_range - early memory pfn range iterator - * @i: an integer used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to ulong for start pfn of the range, can be %NULL - * @p_end: ptr to ulong for end pfn of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL - * - * Walks over configured memory ranges. Available after early_node_map is - * populated. - */ -#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ - for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ - i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid); - -/** - * for_each_free_mem_range - iterate through free memblock areas - * @i: u64 used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL - * - * Walks over free (memory && !reserved) areas of memblock. Available as - * soon as memblock is initialized. - */ -#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ - for (i = 0, \ - __next_free_mem_range(&i, nid, p_start, p_end, p_nid); \ - i != (u64)ULLONG_MAX; \ - __next_free_mem_range(&i, nid, p_start, p_end, p_nid)) - -void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid); +extern void memblock_init(void); +extern void memblock_analyze(void); +extern long memblock_add(phys_addr_t base, phys_addr_t size); +extern long memblock_remove(phys_addr_t base, phys_addr_t size); +extern long memblock_free(phys_addr_t base, phys_addr_t size); +extern long memblock_reserve(phys_addr_t base, phys_addr_t size); -/** - * for_each_free_mem_range_reverse - rev-iterate through free memblock areas - * @i: u64 used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL - * - * Walks over free (memory && !reserved) areas of memblock in reverse - * order. Available as soon as memblock is initialized. 
+/* The numa aware allocator is only available if + * CONFIG_ARCH_POPULATES_NODE_MAP is set */ -#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ - for (i = (u64)ULLONG_MAX, \ - __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \ - i != (u64)ULLONG_MAX; \ - __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) +extern phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, + int nid); +extern phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, + int nid); -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); - -static inline void memblock_set_region_node(struct memblock_region *r, int nid) -{ - r->nid = nid; -} - -static inline int memblock_get_region_node(const struct memblock_region *r) -{ - return r->nid; -} -#else -static inline void memblock_set_region_node(struct memblock_region *r, int nid) -{ -} - -static inline int memblock_get_region_node(const struct memblock_region *r) -{ - return 0; -} -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid); -phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); - -phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); +extern phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 -phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, - phys_addr_t max_addr); -phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, - phys_addr_t max_addr); -phys_addr_t memblock_phys_mem_size(void); -phys_addr_t memblock_start_of_DRAM(void); -phys_addr_t memblock_end_of_DRAM(void); -void memblock_enforce_memory_limit(phys_addr_t memory_limit); -int memblock_is_memory(phys_addr_t addr); -int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); -int memblock_is_reserved(phys_addr_t addr); -int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); - -extern void __memblock_dump_all(void); - -static inline void memblock_dump_all(void) -{ - if (memblock_debug) - __memblock_dump_all(); -} +extern phys_addr_t memblock_alloc_base(phys_addr_t size, + phys_addr_t align, + phys_addr_t max_addr); +extern phys_addr_t __memblock_alloc_base(phys_addr_t size, + phys_addr_t align, + phys_addr_t max_addr); +extern phys_addr_t memblock_phys_mem_size(void); +extern phys_addr_t memblock_start_of_DRAM(void); +extern phys_addr_t memblock_end_of_DRAM(void); +extern void memblock_enforce_memory_limit(phys_addr_t memory_limit); +extern int memblock_is_memory(phys_addr_t addr); +extern int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); +extern int memblock_is_reserved(phys_addr_t addr); +extern int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); + +extern void memblock_dump_all(void); + +/* Provided by the architecture */ +extern phys_addr_t memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid); +extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, + phys_addr_t addr2, phys_addr_t size2); /** * memblock_set_current_limit - Set the current allocation limit to allow @@ -179,7 +101,7 @@ static inline void memblock_dump_all(void) * accessible during boot * @limit: New limit value (physical address) */ -void memblock_set_current_limit(phys_addr_t limit); +extern void 
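The memblock interface this reverts to is the older three-phase one: memblock_init(), a series of memblock_add()/memblock_reserve() calls, then memblock_analyze() before any allocation. A hypothetical early-boot sequence; ram_base, ram_size, and the kernel-image reservation stand in for platform-specific values:

/* Hypothetical early-boot sequence for the reverted memblock API. */
phys_addr_t phys;

memblock_init();				/* set up the static region arrays */
memblock_add(ram_base, ram_size);		/* register detected RAM */
memblock_reserve(__pa(_stext), _end - _stext);	/* keep the kernel image out */
memblock_analyze();				/* recompute memblock.memory_size */

phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (phys == MEMBLOCK_ERROR)			/* 0 signals failure again */
	panic("early allocation failed");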
memblock_set_current_limit(phys_addr_t limit); /* @@ -232,9 +154,9 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region++) -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -#define __init_memblock __meminit -#define __initdata_memblock __meminitdata +#ifdef ARCH_DISCARD_MEMBLOCK +#define __init_memblock __init +#define __initdata_memblock __initdata #else #define __init_memblock #define __initdata_memblock @@ -243,7 +165,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { - return 0; + return MEMBLOCK_ERROR; } #endif /* CONFIG_HAVE_MEMBLOCK */ diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index 5d9b4c9813bd..4baadd18f4ad 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -1253,34 +1253,41 @@ static inline void pgtable_page_dtor(struct page *page) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its + * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in a more * architecture independent manner. This is a substitute for creating the * zone_sizes[] and zholes_size[] arrays and passing them to * free_area_init_node() * * An architecture is expected to register range of page frames backed by - * physical memory with memblock_add[_node]() before calling + * physical memory with add_active_range() before calling * free_area_init_nodes() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) + * add_active_range(node_id, start_pfn, end_pfn) * free_area_init_nodes(max_zone_pfns); * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly - * sparse_memory_present_with_active_regions() calls memory_present() for - * each range when SPARSEMEM is enabled. + * If the architecture guarantees that there are no holes in the ranges + * registered with add_active_range(), free_bootmem_active_regions() + * will call free_bootmem_node() for each registered physical page range. + * Similarly sparse_memory_present_with_active_regions() calls + * memory_present() for each range when SPARSEMEM is enabled. * * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. 
+ * CONFIG_ARCH_POPULATES_NODE_MAP */ extern void free_area_init_nodes(unsigned long *max_zone_pfn); +extern void add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern void remove_all_active_ranges(void); +void sort_node_map(void); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); @@ -1293,11 +1300,14 @@ extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); int add_from_early_node_map(struct range *range, int az, int nr_range, int nid); +u64 __init find_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ +#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) static inline int __early_pfn_to_nid(unsigned long pfn) { diff --git a/trunk/include/linux/mmzone.h b/trunk/include/linux/mmzone.h index 3ac040f19369..188cb2ffe8db 100644 --- a/trunk/include/linux/mmzone.h +++ b/trunk/include/linux/mmzone.h @@ -598,13 +598,13 @@ struct zonelist { #endif }; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP struct node_active_region { unsigned long start_pfn; unsigned long end_pfn; int nid; }; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #ifndef CONFIG_DISCONTIGMEM /* The array of struct pages - for discontigmem use pgdat->lmem_map */ @@ -720,7 +720,7 @@ extern int movable_zone; static inline int zone_movable_is_highmem(void) { -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP) return movable_zone == ZONE_HIGHMEM; #else return 0; @@ -938,7 +938,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #endif #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) + !defined(CONFIG_ARCH_POPULATES_NODE_MAP) static inline unsigned long early_pfn_to_nid(unsigned long pfn) { return 0; diff --git a/trunk/include/linux/perf_event.h b/trunk/include/linux/perf_event.h index 08855613ceb3..b1f89122bf6a 100644 --- a/trunk/include/linux/perf_event.h +++ b/trunk/include/linux/perf_event.h @@ -54,7 +54,6 @@ enum perf_hw_id { PERF_COUNT_HW_BUS_CYCLES = 6, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, - PERF_COUNT_HW_REF_CPU_CYCLES = 9, PERF_COUNT_HW_MAX, /* non-ABI */ }; @@ -891,7 +890,6 @@ struct perf_event_context { int nr_active; int is_active; int nr_stat; - int nr_freq; int rotate_disable; atomic_t refcount; struct task_struct *task; @@ -1065,12 +1063,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct jump_label_key perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_branch(&perf_sched_events)) __perf_event_task_sched_in(prev, task); } @@ -1079,7 +1077,7 @@ static inline void 
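To concretize the restored mm.h comment: a flat, single-node architecture would register its one RAM range and then hand the per-zone limits to free_area_init_nodes(). A sketch under the assumption of an x86-like DMA limit; start_pfn, end_pfn, and MAX_DMA_PFN are placeholders for arch-specific values:

/* Sketch: single-node zone setup via add_active_range(). */
unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

add_active_range(0, start_pfn, end_pfn);	/* node 0: one contiguous range */
max_zone_pfns[ZONE_DMA]    = min(end_pfn, MAX_DMA_PFN);
max_zone_pfns[ZONE_NORMAL] = end_pfn;
free_area_init_nodes(max_zone_pfns);		/* sizes zones, accounts for holes */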
perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_branch(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } diff --git a/trunk/include/linux/poison.h b/trunk/include/linux/poison.h index 2110a81c5e2a..79159de0e341 100644 --- a/trunk/include/linux/poison.h +++ b/trunk/include/linux/poison.h @@ -40,6 +40,12 @@ #define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ #define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT +#define MEMBLOCK_INACTIVE 0x3a84fb0144c9e71bULL +#else +#define MEMBLOCK_INACTIVE 0x44c9e71bUL +#endif + #define SLUB_RED_INACTIVE 0xbb #define SLUB_RED_ACTIVE 0xcc diff --git a/trunk/include/linux/rcupdate.h b/trunk/include/linux/rcupdate.h index 81c04f4348ec..2cf4226ade7e 100644 --- a/trunk/include/linux/rcupdate.h +++ b/trunk/include/linux/rcupdate.h @@ -51,8 +51,6 @@ extern int rcutorture_runnable; /* for sysctl */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) extern void rcutorture_record_test_transition(void); extern void rcutorture_record_progress(unsigned long vernum); -extern void do_trace_rcu_torture_read(char *rcutorturename, - struct rcu_head *rhp); #else static inline void rcutorture_record_test_transition(void) { @@ -60,12 +58,6 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } -#ifdef CONFIG_RCU_TRACE -extern void do_trace_rcu_torture_read(char *rcutorturename, - struct rcu_head *rhp); -#else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) -#endif #endif #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) @@ -185,10 +177,23 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; -extern void rcu_idle_enter(void); -extern void rcu_idle_exit(void); -extern void rcu_irq_enter(void); -extern void rcu_irq_exit(void); + +#ifdef CONFIG_NO_HZ + +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); + +#else /* #ifdef CONFIG_NO_HZ */ + +static inline void rcu_enter_nohz(void) +{ +} + +static inline void rcu_exit_nohz(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ */ /* * Infrastructure to implement the synchronize_() primitives in @@ -228,30 +233,22 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#ifdef CONFIG_PROVE_RCU -extern int rcu_is_cpu_idle(void); -#else /* !CONFIG_PROVE_RCU */ -static inline int rcu_is_cpu_idle(void) -{ - return 0; -} -#endif /* else !CONFIG_PROVE_RCU */ - -static inline void rcu_lock_acquire(struct lockdep_map *map) -{ - WARN_ON_ONCE(rcu_is_cpu_idle()); - lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); -} - -static inline void rcu_lock_release(struct lockdep_map *map) -{ - WARN_ON_ONCE(rcu_is_cpu_idle()); - lock_release(map, 1, _THIS_IP_); -} - extern struct lockdep_map rcu_lock_map; +# define rcu_read_acquire() \ + lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) + extern struct lockdep_map rcu_bh_lock_map; +# define rcu_read_acquire_bh() \ + lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_) + extern struct lockdep_map rcu_sched_lock_map; +# define rcu_read_acquire_sched() \ + 
lock_acquire(&rcu_sched_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define rcu_read_release_sched() \ + lock_release(&rcu_sched_lock_map, 1, _THIS_IP_) + extern int debug_lockdep_rcu_enabled(void); /** @@ -265,18 +262,11 @@ extern int debug_lockdep_rcu_enabled(void); * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. - * - * Note that rcu_read_lock() and the matching rcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * rcu_read_unlock() in process context if the matching rcu_read_lock() - * was invoked from within an irq handler. */ static inline int rcu_read_lock_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) - return 0; return lock_is_held(&rcu_lock_map); } @@ -300,19 +290,6 @@ extern int rcu_read_lock_bh_held(void); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. - * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -321,8 +298,6 @@ static inline int rcu_read_lock_sched_held(void) if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) - return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); @@ -336,8 +311,12 @@ static inline int rcu_read_lock_sched_held(void) #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -# define rcu_lock_acquire(a) do { } while (0) -# define rcu_lock_release(a) do { } while (0) +# define rcu_read_acquire() do { } while (0) +# define rcu_read_release() do { } while (0) +# define rcu_read_acquire_bh() do { } while (0) +# define rcu_read_release_bh() do { } while (0) +# define rcu_read_acquire_sched() do { } while (0) +# define rcu_read_release_sched() do { } while (0) static inline int rcu_read_lock_held(void) { @@ -658,7 +637,7 @@ static inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); - rcu_lock_acquire(&rcu_lock_map); + rcu_read_acquire(); } /* @@ -678,7 +657,7 @@ static inline void rcu_read_lock(void) */ static inline void rcu_read_unlock(void) { - rcu_lock_release(&rcu_lock_map); + rcu_read_release(); __release(RCU); __rcu_read_unlock(); } @@ -694,17 +673,12 @@ static inline void rcu_read_unlock(void) * critical sections in interrupt context can use just rcu_read_lock(), * though this should at least be commented to avoid confusing people * reading the code. - * - * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() - * must occur in the same context, for example, it is illegal to invoke - * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() - * was invoked from some other task. 
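From a caller's point of view the rcu_read_lock()/rcu_read_unlock() pairing is untouched by the lockdep-annotation churn above. A minimal reader sketch; gp is a hypothetical __rcu-annotated pointer and val a hypothetical field:

/* Sketch: plain RCU read-side critical section. */
rcu_read_lock();
p = rcu_dereference(gp);		/* gp: hypothetical RCU-protected pointer */
if (p)
	pr_debug("val=%d\n", p->val);
rcu_read_unlock();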
*/ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); - rcu_lock_acquire(&rcu_bh_lock_map); + rcu_read_acquire_bh(); } /* @@ -714,7 +688,7 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { - rcu_lock_release(&rcu_bh_lock_map); + rcu_read_release_bh(); __release(RCU_BH); local_bh_enable(); } @@ -726,17 +700,12 @@ static inline void rcu_read_unlock_bh(void) * are being done using call_rcu_sched() or synchronize_rcu_sched(). * Read-side critical sections can also be introduced by anything that * disables preemption, including local_irq_disable() and friends. - * - * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() - * must occur in the same context, for example, it is illegal to invoke - * rcu_read_unlock_sched() from process context if the matching - * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); - rcu_lock_acquire(&rcu_sched_lock_map); + rcu_read_acquire_sched(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ @@ -753,7 +722,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void) */ static inline void rcu_read_unlock_sched(void) { - rcu_lock_release(&rcu_sched_lock_map); + rcu_read_release_sched(); __release(RCU_SCHED); preempt_enable(); } diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index cf0eb342bcba..1c4f3e9b9bc5 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -273,11 +273,9 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern void select_nohz_load_balancer(int stop_tick); -extern void set_cpu_sd_state_idle(void); extern int get_nohz_timer_target(void); #else static inline void select_nohz_load_balancer(int stop_tick) { } -static inline void set_cpu_sd_state_idle(void) { } #endif /* @@ -485,8 +483,8 @@ struct task_cputime { #define INIT_CPUTIME \ (struct task_cputime) { \ - .utime = 0, \ - .stime = 0, \ + .utime = cputime_zero, \ + .stime = cputime_zero, \ .sum_exec_runtime = 0, \ } @@ -903,10 +901,6 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; }; struct sched_group { @@ -931,15 +925,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } -/** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - struct sched_domain_attr { int relax_domain_level; }; @@ -1330,8 +1315,8 @@ struct task_struct { * older sibling, respectively. 
(p->father can be replaced with * p->real_parent->pid) */ - struct task_struct __rcu *real_parent; /* real parent process */ - struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ + struct task_struct *real_parent; /* real parent process */ + struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ /* * children/sibling forms the list of my natural children */ @@ -2085,14 +2070,6 @@ extern int sched_setscheduler(struct task_struct *, int, extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern struct task_struct *idle_task(int cpu); -/** - * is_idle_task - is the specified task an idle task? - * @tsk: the task in question. - */ -static inline bool is_idle_task(struct task_struct *p) -{ - return p->pid == 0; -} extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/trunk/include/linux/security.h b/trunk/include/linux/security.h index e8c619d39291..19d8e04e1688 100644 --- a/trunk/include/linux/security.h +++ b/trunk/include/linux/security.h @@ -2056,7 +2056,7 @@ static inline int security_old_inode_init_security(struct inode *inode, char **name, void **value, size_t *len) { - return -EOPNOTSUPP; + return 0; } static inline int security_inode_create(struct inode *dir, diff --git a/trunk/include/linux/srcu.h b/trunk/include/linux/srcu.h index e1b005918bbb..58971e891f48 100644 --- a/trunk/include/linux/srcu.h +++ b/trunk/include/linux/srcu.h @@ -28,7 +28,6 @@ #define _LINUX_SRCU_H #include -#include struct srcu_struct_array { int c[2]; @@ -61,10 +60,18 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, __init_srcu_struct((sp), #sp, &__srcu_key); \ }) +# define srcu_read_acquire(sp) \ + lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define srcu_read_release(sp) \ + lock_release(&(sp)->dep_map, 1, _THIS_IP_) + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *sp); +# define srcu_read_acquire(sp) do { } while (0) +# define srcu_read_release(sp) do { } while (0) + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ void cleanup_srcu_struct(struct srcu_struct *sp); @@ -83,32 +90,12 @@ long srcu_batches_completed(struct srcu_struct *sp); * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. - * - * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot - * and while lockdep is disabled. - * - * Note that if the CPU is in the idle loop from an RCU point of view - * (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then srcu_read_lock_held() returns false even if - * the CPU did an srcu_read_lock(). The reason for this is that RCU - * ignores CPUs that are in such a section, considering these as in - * extended quiescent state, so such a CPU is effectively never in an - * RCU read-side critical section regardless of what RCU primitives it - * invokes. This state of affairs is required --- we need to keep an - * RCU-free window in idle where the CPU may possibly enter into low - * power mode. This way we can notice an extended quiescent state to - * other CPUs that started a grace period. Otherwise we would delay any - * grace period as long as we run in the idle task. 
*/ static inline int srcu_read_lock_held(struct srcu_struct *sp) { - if (rcu_is_cpu_idle()) - return 0; - - if (!debug_lockdep_rcu_enabled()) - return 1; - - return lock_is_held(&sp->dep_map); + if (debug_locks) + return lock_is_held(&sp->dep_map); + return 1; } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -158,17 +145,12 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). - * - * Note that srcu_read_lock() and the matching srcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() - * was invoked in process context. */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { int retval = __srcu_read_lock(sp); - rcu_lock_acquire(&(sp)->dep_map); + srcu_read_acquire(sp); return retval; } @@ -182,51 +164,8 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp) { - rcu_lock_release(&(sp)->dep_map); - __srcu_read_unlock(sp, idx); -} - -/** - * srcu_read_lock_raw - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. - * - * Enter an SRCU read-side critical section. Similar to srcu_read_lock(), - * but avoids the RCU-lockdep checking. This means that it is legal to - * use srcu_read_lock_raw() in one context, for example, in an exception - * handler, and then have the matching srcu_read_unlock_raw() in another - * context, for example in the task that took the exception. - * - * However, the entire SRCU read-side critical section must reside within a - * single task. For example, beware of using srcu_read_lock_raw() in - * a device interrupt handler and srcu_read_unlock() in the interrupted - * task: This will not work if interrupts are threaded. - */ -static inline int srcu_read_lock_raw(struct srcu_struct *sp) -{ - unsigned long flags; - int ret; - - local_irq_save(flags); - ret = __srcu_read_lock(sp); - local_irq_restore(flags); - return ret; -} - -/** - * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock_raw(). - * - * Exit an SRCU read-side critical section without lockdep-RCU checking. - * See srcu_read_lock_raw() for more details. 
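For the srcu_read_lock()/srcu_read_unlock() pair shown above, the index returned by the lock side must be passed back on unlock. A sketch with a hypothetical srcu_struct, initialized elsewhere with init_srcu_struct():

/* Sketch: SRCU read-side critical section. */
static struct srcu_struct my_srcu;

static void srcu_reader(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);
	/* ... dereference data protected by my_srcu ... */
	srcu_read_unlock(&my_srcu, idx);
}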
- */ -static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx) -{ - unsigned long flags; - - local_irq_save(flags); + srcu_read_release(sp); __srcu_read_unlock(sp, idx); - local_irq_restore(flags); } #endif diff --git a/trunk/include/linux/tick.h b/trunk/include/linux/tick.h index ab8be90b5cc9..b232ccc0ee29 100644 --- a/trunk/include/linux/tick.h +++ b/trunk/include/linux/tick.h @@ -7,7 +7,6 @@ #define _LINUX_TICK_H #include -#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -122,16 +121,14 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_idle_enter(void); -extern void tick_nohz_idle_exit(void); -extern void tick_nohz_irq_exit(void); +extern void tick_nohz_stop_sched_tick(int inidle); +extern void tick_nohz_restart_sched_tick(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_idle_enter(void) { } -static inline void tick_nohz_idle_exit(void) { } - +static inline void tick_nohz_stop_sched_tick(int inidle) { } +static inline void tick_nohz_restart_sched_tick(void) { } static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index a9ce45e8501c..3efc9f3f43a0 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -77,13 +77,13 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); #define init_waitqueue_head(q) \ do { \ static struct lock_class_key __key; \ \ - __init_waitqueue_head((q), #q, &__key); \ + __init_waitqueue_head((q), &__key); \ } while (0) #ifdef CONFIG_LOCKDEP diff --git a/trunk/include/net/dst.h b/trunk/include/net/dst.h index 75766b42660e..6faec1a60216 100644 --- a/trunk/include/net/dst.h +++ b/trunk/include/net/dst.h @@ -53,7 +53,6 @@ struct dst_entry { #define DST_NOHASH 0x0008 #define DST_NOCACHE 0x0010 #define DST_NOCOUNT 0x0020 -#define DST_NOPEER 0x0040 short error; short obsolete; diff --git a/trunk/include/net/flow.h b/trunk/include/net/flow.h index 57f15a7f1cdd..a09447749e2d 100644 --- a/trunk/include/net/flow.h +++ b/trunk/include/net/flow.h @@ -207,7 +207,6 @@ extern struct flow_cache_object *flow_cache_lookup( u8 dir, flow_resolve_t resolver, void *ctx); extern void flow_cache_flush(void); -extern void flow_cache_flush_deferred(void); extern atomic_t flow_cache_genid; #endif diff --git a/trunk/include/net/ip_vs.h b/trunk/include/net/ip_vs.h index e5a7b9aaf552..873d5be7926c 100644 --- a/trunk/include/net/ip_vs.h +++ b/trunk/include/net/ip_vs.h @@ -1207,7 +1207,7 @@ extern void ip_vs_control_cleanup(void); extern struct ip_vs_dest * ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, - __u16 protocol, __u32 fwmark, __u32 flags); + __u16 protocol, __u32 fwmark); extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/trunk/include/net/sctp/structs.h b/trunk/include/net/sctp/structs.h index a15432da27c3..e90e7a9935dd 100644 --- a/trunk/include/net/sctp/structs.h +++ b/trunk/include/net/sctp/structs.h @@ -241,9 +241,6 @@ 
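tick_nohz_stop_sched_tick() and tick_nohz_restart_sched_tick(), restored in the tick.h hunk above, were called directly from each architecture's idle loop rather than via the later idle_enter/idle_exit helpers. Schematically, with cpu_sleep() standing in for the arch wait-for-interrupt primitive:

/* Sketch: the idle-loop shape assumed by this revert. */
for (;;) {
	tick_nohz_stop_sched_tick(1);		/* inidle == 1 */
	while (!need_resched())
		cpu_sleep();			/* hypothetical arch wfi */
	tick_nohz_restart_sched_tick();
	/* ... then reschedule, as the arch idle loop does ... */
}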
extern struct sctp_globals { * bits is an indicator of when to send and window update SACK. */ int rwnd_update_shift; - - /* Threshold for autoclose timeout, in seconds. */ - unsigned long max_autoclose; } sctp_globals; #define sctp_rto_initial (sctp_globals.rto_initial) @@ -284,7 +281,6 @@ extern struct sctp_globals { #define sctp_auth_enable (sctp_globals.auth_enable) #define sctp_checksum_disable (sctp_globals.checksum_disable) #define sctp_rwnd_upd_shift (sctp_globals.rwnd_update_shift) -#define sctp_max_autoclose (sctp_globals.max_autoclose) /* SCTP Socket type: UDP or TCP style. */ typedef enum { diff --git a/trunk/include/net/sock.h b/trunk/include/net/sock.h index 32e39371fba6..abb6e0f0c3c3 100644 --- a/trunk/include/net/sock.h +++ b/trunk/include/net/sock.h @@ -637,14 +637,12 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) /* * Take into account size of receive queue and backlog queue - * Do not take into account this skb truesize, - * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); - return qsize > sk->sk_rcvbuf; + return qsize + skb->truesize > sk->sk_rcvbuf; } /* The per-socket spinlock must be held here. */ diff --git a/trunk/include/trace/events/rcu.h b/trunk/include/trace/events/rcu.h index d2d88bed891b..669fbd62ec25 100644 --- a/trunk/include/trace/events/rcu.h +++ b/trunk/include/trace/events/rcu.h @@ -241,73 +241,24 @@ TRACE_EVENT(rcu_fqs, /* * Tracepoint for dyntick-idle entry/exit events. These take a string - * as argument: "Start" for entering dyntick-idle mode, "End" for - * leaving it, "--=" for events moving towards idle, and "++=" for events - * moving away from idle. "Error on entry: not idle task" and "Error on - * exit: not idle task" indicate that a non-idle task is erroneously - * toying with the idle loop. - * - * These events also take a pair of numbers, which indicate the nesting - * depth before and after the event of interest. Note that task-related - * events use the upper bits of each number, while interrupt-related - * events use the lower bits. + * as argument: "Start" for entering dyntick-idle mode and "End" for + * leaving it. */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, long long oldnesting, long long newnesting), + TP_PROTO(char *polarity), - TP_ARGS(polarity, oldnesting, newnesting), + TP_ARGS(polarity), TP_STRUCT__entry( __field(char *, polarity) - __field(long long, oldnesting) - __field(long long, newnesting) ), TP_fast_assign( __entry->polarity = polarity; - __entry->oldnesting = oldnesting; - __entry->newnesting = newnesting; - ), - - TP_printk("%s %llx %llx", __entry->polarity, - __entry->oldnesting, __entry->newnesting) -); - -/* - * Tracepoint for RCU preparation for idle, the goal being to get RCU - * processing done so that the current CPU can shut off its scheduling - * clock and enter dyntick-idle mode. One way to accomplish this is - * to drain all RCU callbacks from this CPU, and the other is to have - * done everything RCU requires for the current grace period. In this - * latter case, the CPU will be awakened at the end of the current grace - * period in order to process the remainder of its callbacks. - * - * These tracepoints take a string as argument: - * - * "No callbacks": Nothing to do, no callbacks on this CPU. - * "In holdoff": Nothing to do, holding off after unsuccessful attempt. 
- * "Begin holdoff": Attempt failed, don't retry until next jiffy. - * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. - * "More callbacks": Still more callbacks, try again to clear them out. - * "Callbacks drained": All callbacks processed, off to dyntick idle! - * "Timer": Timer fired to cause CPU to continue processing callbacks. - */ -TRACE_EVENT(rcu_prep_idle, - - TP_PROTO(char *reason), - - TP_ARGS(reason), - - TP_STRUCT__entry( - __field(char *, reason) - ), - - TP_fast_assign( - __entry->reason = reason; ), - TP_printk("%s", __entry->reason) + TP_printk("%s", __entry->polarity) ); /* @@ -461,71 +412,27 @@ TRACE_EVENT(rcu_invoke_kfree_callback, /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been - * invoked. The first argument is the name of the RCU flavor, - * the second argument is number of callbacks actually invoked, - * the third argument (cb) is whether or not any of the callbacks that - * were ready to invoke at the beginning of this batch are still - * queued, the fourth argument (nr) is the return value of need_resched(), - * the fifth argument (iit) is 1 if the current task is the idle task, - * and the sixth argument (risk) is the return value from - * rcu_is_callbacks_kthread(). + * invoked. The first argument is the name of the RCU flavor and + * the second argument is number of callbacks actually invoked. */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked, - bool cb, bool nr, bool iit, bool risk), + TP_PROTO(char *rcuname, int callbacks_invoked), - TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), + TP_ARGS(rcuname, callbacks_invoked), TP_STRUCT__entry( __field(char *, rcuname) __field(int, callbacks_invoked) - __field(bool, cb) - __field(bool, nr) - __field(bool, iit) - __field(bool, risk) ), TP_fast_assign( __entry->rcuname = rcuname; __entry->callbacks_invoked = callbacks_invoked; - __entry->cb = cb; - __entry->nr = nr; - __entry->iit = iit; - __entry->risk = risk; - ), - - TP_printk("%s CBs-invoked=%d idle=%c%c%c%c", - __entry->rcuname, __entry->callbacks_invoked, - __entry->cb ? 'C' : '.', - __entry->nr ? 'S' : '.', - __entry->iit ? 'I' : '.', - __entry->risk ? 'R' : '.') -); - -/* - * Tracepoint for rcutorture readers. The first argument is the name - * of the RCU flavor from rcutorture's viewpoint and the second argument - * is the callback address. 
- */ -TRACE_EVENT(rcu_torture_read, - - TP_PROTO(char *rcutorturename, struct rcu_head *rhp), - - TP_ARGS(rcutorturename, rhp), - - TP_STRUCT__entry( - __field(char *, rcutorturename) - __field(struct rcu_head *, rhp) - ), - - TP_fast_assign( - __entry->rcutorturename = rcutorturename; - __entry->rhp = rhp; ), - TP_printk("%s torture read %p", - __entry->rcutorturename, __entry->rhp) + TP_printk("%s CBs-invoked=%d", + __entry->rcuname, __entry->callbacks_invoked) ); #else /* #ifdef CONFIG_RCU_TRACE */ @@ -536,16 +443,13 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) -#define trace_rcu_prep_idle(reason) do { } while (0) +#define trace_rcu_dyntick(polarity) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) -#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ - do { } while (0) -#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ diff --git a/trunk/include/trace/events/sched.h b/trunk/include/trace/events/sched.h index 6ba596b07a72..959ff18b63b6 100644 --- a/trunk/include/trace/events/sched.h +++ b/trunk/include/trace/events/sched.h @@ -330,13 +330,6 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); -/* - * Tracepoint for accounting blocked time (time the task is in uninterruptible). - */ -DEFINE_EVENT(sched_stat_template, sched_stat_blocked, - TP_PROTO(struct task_struct *tsk, u64 delay), - TP_ARGS(tsk, delay)); - /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). @@ -370,56 +363,6 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); -#ifdef CREATE_TRACE_POINTS -static inline u64 trace_get_sleeptime(struct task_struct *tsk) -{ -#ifdef CONFIG_SCHEDSTATS - u64 block, sleep; - - block = tsk->se.statistics.block_start; - sleep = tsk->se.statistics.sleep_start; - tsk->se.statistics.block_start = 0; - tsk->se.statistics.sleep_start = 0; - - return block ? block : sleep ? sleep : 0; -#else - return 0; -#endif -} -#endif - -/* - * Tracepoint for accounting sleeptime (time the task is sleeping - * or waiting for I/O). - */ -TRACE_EVENT(sched_stat_sleeptime, - - TP_PROTO(struct task_struct *tsk, u64 now), - - TP_ARGS(tsk, now), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( u64, sleeptime ) - ), - - TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); - __entry->pid = tsk->pid; - __entry->sleeptime = trace_get_sleeptime(tsk); - __entry->sleeptime = __entry->sleeptime ? 
- now - __entry->sleeptime : 0; - ) - TP_perf_assign( - __perf_count(__entry->sleeptime); - ), - - TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]", - __entry->comm, __entry->pid, - (unsigned long long)__entry->sleeptime) -); - /* * Tracepoint for showing priority inheritance modifying a tasks * priority. diff --git a/trunk/include/trace/events/writeback.h b/trunk/include/trace/events/writeback.h index 99d1d0decf88..b99caa8b780c 100644 --- a/trunk/include/trace/events/writeback.h +++ b/trunk/include/trace/events/writeback.h @@ -21,16 +21,6 @@ {I_REFERENCED, "I_REFERENCED"} \ ) -#define WB_WORK_REASON \ - {WB_REASON_BACKGROUND, "background"}, \ - {WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages"}, \ - {WB_REASON_SYNC, "sync"}, \ - {WB_REASON_PERIODIC, "periodic"}, \ - {WB_REASON_LAPTOP_TIMER, "laptop_timer"}, \ - {WB_REASON_FREE_MORE_MEM, "free_more_memory"}, \ - {WB_REASON_FS_FREE_SPACE, "fs_free_space"}, \ - {WB_REASON_FORKER_THREAD, "forker_thread"} - struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_work_class, @@ -65,7 +55,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - __print_symbolic(__entry->reason, WB_WORK_REASON) + wb_reason_name[__entry->reason] ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -194,8 +184,7 @@ TRACE_EVENT(writeback_queue_io, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - __print_symbolic(__entry->reason, WB_WORK_REASON) - ) + wb_reason_name[__entry->reason]) ); TRACE_EVENT(global_dirty_state, diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index a34cd174b768..43298f9810fb 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -469,14 +469,14 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ && SMP + depends on TREE_RCU && NO_HZ && SMP default n help This option causes RCU to attempt to accelerate grace periods - in order to allow CPUs to enter dynticks-idle state more - quickly. On the other hand, this option increases the overhead - of the dynticks-idle checking, particularly on systems with - large numbers of CPUs. + in order to allow the final CPU to enter dynticks-idle state + more quickly. On the other hand, this option increases the + overhead of the dynticks-idle checking, particularly on systems + with large numbers of CPUs. Say Y if energy efficiency is critically important, particularly if you have relatively few CPUs. @@ -702,6 +702,7 @@ config CGROUP_PERF menuconfig CGROUP_SCHED bool "Group CPU scheduler" + depends on EXPERIMENTAL default n help This feature lets CPU scheduler recognize task groups and control CPU diff --git a/trunk/init/main.c b/trunk/init/main.c index 2c76efb513c2..217ed23e9487 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -469,12 +469,13 @@ asmlinkage void __init start_kernel(void) char * command_line; extern const struct kernel_param __start___param[], __stop___param[]; + smp_setup_processor_id(); + /* * Need to run as early as possible, to initialize the * lockdep hash: */ lockdep_init(); - smp_setup_processor_id(); debug_objects_early_init(); /* diff --git a/trunk/kernel/Makefile b/trunk/kernel/Makefile index f70396e5a24b..e898c5b9d02c 100644 --- a/trunk/kernel/Makefile +++ b/trunk/kernel/Makefile @@ -2,15 +2,16 @@ # Makefile for the linux kernel. 
# -obj-y = fork.o exec_domain.o panic.o printk.o \ +obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o cred.o \ - async.o range.o groups.o + notifier.o ksysfs.o sched_clock.o cred.o \ + async.o range.o +obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -19,11 +20,10 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif -obj-y += sched/ - obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o @@ -99,6 +99,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ +obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o @@ -109,6 +110,15 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +# According to Alan Modra , the -fno-omit-frame-pointer is +# needed for x86 only. Why this used to be enabled for all architectures is beyond +# me. I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer +endif + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. 
diff --git a/trunk/kernel/acct.c b/trunk/kernel/acct.c index 203dfead2e06..fa7eb3de2ddc 100644 --- a/trunk/kernel/acct.c +++ b/trunk/kernel/acct.c @@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime += current->utime; - pacct->ac_stime += current->stime; + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/trunk/kernel/cpu.c b/trunk/kernel/cpu.c index 5ca38d5d238a..563f13609470 100644 --- a/trunk/kernel/cpu.c +++ b/trunk/kernel/cpu.c @@ -178,7 +178,8 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (p->utime || p->stime)) + (!cputime_eq(p->utime, cputime_zero) || + !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, @@ -379,7 +380,6 @@ int __cpuinit cpu_up(unsigned int cpu) cpu_maps_update_done(); return err; } -EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; diff --git a/trunk/kernel/debug/kdb/kdb_support.c b/trunk/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..5532dd37aa86 100644 --- a/trunk/kernel/debug/kdb/kdb_support.c +++ b/trunk/kernel/debug/kdb/kdb_support.c @@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) (p->exit_state & EXIT_ZOMBIE) ? 'Z' : (p->exit_state & EXIT_DEAD) ? 'E' : (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; - if (is_idle_task(p)) { + if (p->pid == 0) { /* Idle task. Is it really idle, apart from the kdb * interrupt? */ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { diff --git a/trunk/kernel/events/Makefile b/trunk/kernel/events/Makefile index 22d901f9caf4..89e5e8aa4c36 100644 --- a/trunk/kernel/events/Makefile +++ b/trunk/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o ring_buffer.o callchain.o +obj-y := core.o ring_buffer.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/trunk/kernel/events/callchain.c b/trunk/kernel/events/callchain.c deleted file mode 100644 index 057e24b665cf..000000000000 --- a/trunk/kernel/events/callchain.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Performance events callchain code, extracted from core.c: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. 
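The acct.c and cpu.c hunks above revert from plain integer arithmetic to the opaque cputime helpers. Accumulating and testing a task's time then reads as follows; p is a hypothetical struct task_struct pointer:

/* Sketch with the reverted opaque cputime helpers. */
cputime_t total = cputime_add(p->utime, p->stime);

if (cputime_eq(total, cputime_zero))
	pr_debug("pid %d: no CPU time consumed yet\n", task_pid_nr(p));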
- * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include "internal.h" - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -static struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - 
perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} diff --git a/trunk/kernel/events/core.c b/trunk/kernel/events/core.c index 890eb02c2f21..58690af323e4 100644 --- a/trunk/kernel/events/core.c +++ b/trunk/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct jump_label_key perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -1130,8 +1130,6 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; - if (event->attr.freq && event->attr.sample_freq) - ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -1327,7 +1325,6 @@ void perf_event_disable(struct perf_event *event) } raw_spin_unlock_irq(&ctx->lock); } -EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, @@ -1409,8 +1406,6 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; - if (event->attr.freq && event->attr.sample_freq) - ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; @@ -1667,7 +1662,8 @@ perf_install_in_context(struct perf_event_context *ctx, * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. 
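The callchain buffers deleted above (and re-added to core.c below) are sized with offsetof() on a trailing array member: one allocation covers the struct header plus an nr_cpu_ids-long pointer array, while each CPU's entry storage is allocated separately so it can be NUMA-local via kmalloc_node(..., cpu_to_node(cpu)). A minimal userspace sketch of the sizing pattern, with stand-in types for the kernel's (the variable-index offsetof is a GCC/Clang extension the kernel relies on):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct entry { unsigned long ips[64]; };   /* stand-in for perf_callchain_entry */

struct cpus_entries {
    void *header;                          /* stand-in for the rcu_head */
    struct entry *cpu_entries[];           /* trailing array, [0] in the kernel */
};

int main(void)
{
    int nr_cpus = 8;                       /* the kernel uses nr_cpu_ids */
    /* One allocation covers the header plus nr_cpus pointers. */
    size_t size = offsetof(struct cpus_entries, cpu_entries[nr_cpus]);
    struct cpus_entries *e = calloc(1, size);
    int cpu;

    if (!e)
        return 1;
    for (cpu = 0; cpu < nr_cpus; cpu++) {
        /* kernel: kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)) */
        e->cpu_entries[cpu] = malloc(sizeof(struct entry));
        if (!e->cpu_entries[cpu])
            goto fail;
    }
    printf("%zu-byte header block for %d cpus\n", size, nr_cpus);
    return 0;
fail:
    for (cpu = 0; cpu < nr_cpus; cpu++)
        free(e->cpu_entries[cpu]);         /* free(NULL) is a no-op */
    free(e);
    return 1;
}

Allocating the per-CPU entries individually, rather than through the percpu allocator, is what the in-code comment is about: these buffers must be safe to touch from NMI context.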
*/ -static void __perf_event_mark_enabled(struct perf_event *event) +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) { struct perf_event *sub; u64 tstamp = perf_event_time(event); @@ -1705,7 +1701,7 @@ static int __perf_event_enable(void *info) */ perf_cgroup_set_timestamp(current, ctx); - __perf_event_mark_enabled(event); + __perf_event_mark_enabled(event, ctx); if (!event_filter_match(event)) { if (is_cgroup_event(event)) @@ -1786,7 +1782,7 @@ void perf_event_enable(struct perf_event *event) retry: if (!ctx->is_active) { - __perf_event_mark_enabled(event); + __perf_event_mark_enabled(event, ctx); goto out; } @@ -1813,7 +1809,6 @@ void perf_event_enable(struct perf_event *event) out: raw_spin_unlock_irq(&ctx->lock); } -EXPORT_SYMBOL_GPL(perf_event_enable); int perf_event_refresh(struct perf_event *event, int refresh) { @@ -2332,9 +2327,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; - if (!ctx->nr_freq) - return; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2390,14 +2382,12 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1, freq = 0; + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; - if (cpuctx->ctx.nr_freq) - freq = 1; } ctx = cpuctx->task_ctx; @@ -2405,40 +2395,33 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; - if (ctx->nr_freq) - freq = 1; } - if (!rotate && !freq) - goto done; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); + if (ctx) + perf_ctx_adjust_freq(ctx, interval); - if (freq) { - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - } - - if (rotate) { - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (!rotate) + goto done; - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, current); - } + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + perf_event_sched_in(cpuctx, ctx, current); done: if (remove) list_del_init(&cpuctx->rotation_list); + + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) @@ -2465,7 +2448,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + __perf_event_mark_enabled(event, ctx); return 1; } @@ -2497,7 +2480,13 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); - list_for_each_entry(event, &ctx->event_list, event_entry) { + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { ret = event_enable_on_exec(event, ctx); if (ret) enabled = 1; @@ -2584,6 +2573,215 @@ static 
u64 perf_event_read(struct perf_event *event) return perf_event_count(event); } +/* + * Callchain support + */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + 
put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + /* * Initialize the perf_event context in a task_struct: */ @@ -2748,7 +2946,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2759,7 +2957,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + jump_label_dec(&perf_sched_events); } } @@ -4622,6 +4820,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct hw_perf_event *hwc = &event->hw; int throttle = 0; + data->period = event->hw.last_period; if (!overflow) overflow = perf_swevent_set_period(event); @@ -4655,12 +4854,6 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; - if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { - data->period = nr; - return perf_swevent_overflow(event, 1, data, regs); - } else - data->period = event->hw.last_period; - if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); @@ -5173,7 +5366,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && is_idle_task(current))) + if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } @@ -5788,7 +5981,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6026,7 +6219,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + jump_label_inc(&perf_sched_events); } /* @@ -6872,9 +7065,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); } static int __init perf_event_sysfs_init(void) diff --git a/trunk/kernel/events/internal.h b/trunk/kernel/events/internal.h index b0b107f90afc..64568a699375 100644 --- a/trunk/kernel/events/internal.h +++ b/trunk/kernel/events/internal.h @@ -1,10 +1,6 @@ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H -#include - -/* Buffer 
handling */ - #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { @@ -71,7 +67,7 @@ static inline int page_order(struct ring_buffer *rb) } #endif -static inline unsigned long perf_data_size(struct ring_buffer *rb) +static unsigned long perf_data_size(struct ring_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } @@ -100,37 +96,4 @@ __output_copy(struct perf_output_handle *handle, } while (len); } -/* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -extern int get_callchain_buffers(void); -extern void put_callchain_buffers(void); - -static inline int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - #endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index d579a459309d..d0b7d988f873 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; - sig->gtime += tsk->gtime; + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1255,9 +1255,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; - psig->cutime += tgutime + sig->cutime; - psig->cstime += tgstime + sig->cstime; - psig->cgtime += p->gtime + sig->gtime + sig->cgtime; + psig->cutime = + cputime_add(psig->cutime, + cputime_add(tgutime, + sig->cutime)); + psig->cstime = + cputime_add(psig->cstime, + cputime_add(tgstime, + sig->cstime)); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, + cputime_add(sig->gtime, + sig->cgtime))); psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += @@ -1530,15 +1540,8 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { - /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). - */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) - wo->notask_error = 0; + if (p->exit_state == EXIT_DEAD) return 0; - } /* slay zombie? 
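The recursion guards dropped from internal.h above (and folded into core.c earlier in this patch) keep one counter per preemption context: task, softirq, hardirq, and NMI. A perf event firing inside the handling of another perf event in the same context bails out instead of corrupting the shared callchain entry. A sketch of the guard, with an explicit context argument standing in for the kernel's in_nmi()/in_irq()/in_softirq() tests:

#include <stdio.h>

#define NR_CONTEXTS 4   /* 0 task, 1 softirq, 2 hardirq, 3 NMI (PERF_NR_CONTEXTS) */

static int recursion[NR_CONTEXTS];  /* kernel: one such array per CPU */

/* Claim the slot for this context; -1 means "already inside, bail out". */
static int get_recursion_context(int ctx)
{
    if (recursion[ctx])
        return -1;
    recursion[ctx]++;       /* the kernel brackets the update with barrier() */
    return ctx;
}

static void put_recursion_context(int rctx)
{
    recursion[rctx]--;
}

int main(void)
{
    int rctx = get_recursion_context(0);

    printf("first enter: %d\n", rctx);                  /* 0 */
    printf("reentry: %d\n", get_recursion_context(0));  /* -1: rejected */
    put_recursion_context(rctx);
    return 0;
}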
*/ if (p->exit_state == EXIT_ZOMBIE) { diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index b058c5820ecd..da4a6a10d088 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { - tsk->cputime_expires.prof_exp = 0; - tsk->cputime_expires.virt_exp = 0; + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; tsk->cputime_expires.sched_exp = 0; INIT_LIST_HEAD(&tsk->cpu_timers[0]); INIT_LIST_HEAD(&tsk->cpu_timers[1]); @@ -1132,10 +1132,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, init_sigpending(&p->pending); - p->utime = p->stime = p->gtime = 0; - p->utimescaled = p->stimescaled = 0; + p->utime = cputime_zero; + p->stime = cputime_zero; + p->gtime = cputime_zero; + p->utimescaled = cputime_zero; + p->stimescaled = cputime_zero; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_utime = cputime_zero; + p->prev_stime = cputime_zero; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/trunk/kernel/futex.c b/trunk/kernel/futex.c index 1614be20173d..ea87f4d2f455 100644 --- a/trunk/kernel/futex.c +++ b/trunk/kernel/futex.c @@ -314,29 +314,17 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) #endif lock_page(page_head); - - /* - * If page_head->mapping is NULL, then it cannot be a PageAnon - * page; but it might be the ZERO_PAGE or in the gate area or - * in a special mapping (all cases which we are happy to fail); - * or it may have been a good file page when get_user_pages_fast - * found it, but truncated or holepunched or subjected to - * invalidate_complete_page2 before we got the page lock (also - * cases which we are happy to fail). And we hold a reference, - * so refcount care in invalidate_complete_page's remove_mapping - * prevents drop_caches from setting mapping to NULL beneath us. - * - * The case we do have to guard against is when memory pressure made - * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. - */ if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); unlock_page(page_head); put_page(page_head); - if (shmem_swizzled) - goto again; - return -EFAULT; + /* + * ZERO_PAGE pages don't have a mapping. Avoid a busy loop + * trying to find one. RW mapping would have COW'd (and thus + * have a mapping) so this page is RO and won't ever change. + */ + if ((page_head == ZERO_PAGE(address))) + return -EFAULT; + goto again; } /* diff --git a/trunk/kernel/hung_task.c b/trunk/kernel/hung_task.c index 2e48ec0c2e91..8b1748d0172c 100644 --- a/trunk/kernel/hung_task.c +++ b/trunk/kernel/hung_task.c @@ -74,17 +74,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) /* * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. + * Also, when a freshly created task is scheduled once, changes + * its state to TASK_UNINTERRUPTIBLE without having ever been + * switched out once, it musn't be checked. */ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. 
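The cputime conversions running through acct.c, exit.c, fork.c and the timer code in this patch revert plain integer arithmetic on utime/stime back to the cputime_*() accessors. On asm-generic configurations of that era, cputime_t was an integer count of jiffies and the accessors reduced to ordinary arithmetic; s390 and ia64 used different representations, which is what the opaque style protects. A sketch of the asm-generic flavor, assuming a jiffies-based cputime_t:

#include <stdio.h>

typedef unsigned long cputime_t;

#define cputime_zero            ((cputime_t)0)
#define cputime_add(a, b)       ((a) + (b))
#define cputime_sub(a, b)       ((a) - (b))
#define cputime_halve(a)        ((a) >> 1)
#define cputime_eq(a, b)        ((a) == (b))
#define cputime_lt(a, b)        ((a) < (b))
#define cputime_gt(a, b)        ((a) > (b))

int main(void)
{
    cputime_t utime = 30, stime = 12;   /* ticks, for illustration */
    cputime_t total = cputime_add(utime, stime);

    /* The accessor style keeps arch-specific representations opaque. */
    printf("total: %lu, zero? %d\n", total, cputime_eq(total, cputime_zero));
    return 0;
}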
- */ - if (unlikely(!switch_count)) + if (unlikely(t->flags & PF_FROZEN || !switch_count)) return; if (switch_count != t->last_switch_count) { diff --git a/trunk/kernel/itimer.c b/trunk/kernel/itimer.c index 22000c3db0dd..d802883153da 100644 --- a/trunk/kernel/itimer.c +++ b/trunk/kernel/itimer.c @@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; - if (cval) { + if (!cputime_eq(cval, cputime_zero)) { struct task_cputime cputime; cputime_t t; thread_group_cputimer(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; + t = cputime_add(cputime.utime, cputime.stime); else /* CPUCLOCK_VIRT */ t = cputime.utime; - if (cval < t) + if (cputime_le(cval, t)) /* about to fire */ cval = cputime_one_jiffy; else - cval = cval - t; + cval = cputime_sub(cval, t); } spin_unlock_irq(&tsk->sighand->siglock); @@ -161,9 +161,10 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; - if (cval || nval) { - if (nval > 0) - nval += cputime_one_jiffy; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); set_process_cpu_timer(tsk, clock_id, &nval, &cval); } it->expires = nval; diff --git a/trunk/kernel/jump_label.c b/trunk/kernel/jump_label.c index 30c3c7708132..66ff7109f697 100644 --- a/trunk/kernel/jump_label.c +++ b/trunk/kernel/jump_label.c @@ -72,46 +72,15 @@ void jump_label_inc(struct jump_label_key *key) jump_label_unlock(); } -static void __jump_label_dec(struct jump_label_key *key, - unsigned long rate_limit, struct delayed_work *work) +void jump_label_dec(struct jump_label_key *key) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) return; - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + jump_label_update(key, JUMP_LABEL_DISABLE); jump_label_unlock(); } -static void jump_label_update_timeout(struct work_struct *work) -{ - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); -} - -void jump_label_dec(struct jump_label_key *key) -{ - __jump_label_dec(key, 0, NULL); -} - -void jump_label_dec_deferred(struct jump_label_key_deferred *key) -{ - __jump_label_dec(&key->key, key->timeout, &key->work); -} - - -void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl) -{ - key->timeout = rl; - INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); -} - static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (entry->code <= (unsigned long)end && @@ -142,7 +111,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, * running code can override this to make the non-live update case * cheaper. */ -void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, +void __weak arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { arch_jump_label_transform(entry, type); @@ -248,13 +217,8 @@ void jump_label_apply_nops(struct module *mod) if (iter_start == iter_stop) return; - for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); - } + for (iter = iter_start; iter < iter_stop; iter++) + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } static int jump_label_add_module(struct module *mod) @@ -294,7 +258,8 @@ static int jump_label_add_module(struct module *mod) key->next = jlm; if (jump_label_enabled(key)) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); + __jump_label_update(key, iter, iter_stop, + JUMP_LABEL_ENABLE); } return 0; diff --git a/trunk/kernel/lockdep.c b/trunk/kernel/lockdep.c index 8889f7dd7c46..b2e08c932d91 100644 --- a/trunk/kernel/lockdep.c +++ b/trunk/kernel/lockdep.c @@ -431,7 +431,6 @@ unsigned int max_lockdep_depth; * about it later on, in lockdep_info(). */ static int lockdep_init_error; -static const char *lock_init_error; static unsigned long lockdep_init_trace_data[20]; static struct stack_trace lockdep_init_trace = { .max_entries = ARRAY_SIZE(lockdep_init_trace_data), @@ -500,32 +499,36 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static void __print_lock_name(struct lock_class *class) +static int __print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; name = class->name; - if (!name) { + if (!name) name = __get_key_name(class->key, str); - printk("%s", name); - } else { - printk("%s", name); - if (class->name_version > 1) - printk("#%d", class->name_version); - if (class->subclass) - printk("/%d", class->subclass); - } + + return printk("%s", name); } static void print_lock_name(struct lock_class *class) { - char usage[LOCK_USAGE_CHARS]; + char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; + const char *name; get_usage_chars(class, usage); - printk(" ("); - __print_lock_name(class); + name = class->name; + if (!name) { + name = __get_key_name(class->key, str); + printk(" (%s", name); + } else { + printk(" (%s", name); + if (class->name_version > 1) + printk("#%d", class->name_version); + if (class->subclass) + printk("/%d", class->subclass); + } printk("){%s}", usage); } @@ -565,12 +568,11 @@ static void lockdep_print_held_locks(struct task_struct *curr) } } -static void print_kernel_ident(void) +static void print_kernel_version(void) { - printk("%s %.*s %s\n", init_utsname()->release, + printk("%s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, - print_tainted()); + init_utsname()->version); } static int very_verbose(struct lock_class *class) @@ -654,7 +656,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (unlikely(!lockdep_initialized)) { lockdep_init(); lockdep_init_error = 1; - lock_init_error = lock->name; save_stack_trace(&lockdep_init_trace); } #endif @@ -722,7 +723,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class = look_up_lock_class(lock, subclass); if (likely(class)) - goto out_set_class_cache; + return class; /* * Debug-check: all keys must be persistent! 
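The jump_label.c hunks above strip the deferred, rate-limited key variant, leaving plain jump_label_inc()/jump_label_dec(): a reference count where only the 0 to 1 and 1 to 0 transitions patch the branch sites. A userspace sketch of the counting logic only (the kernel serializes the transition under jump_label_mutex and patches real instructions; a flag stands in for that here):

#include <stdio.h>
#include <stdatomic.h>

static atomic_int enabled;      /* jump_label_key::enabled */
static int branch_live;         /* stand-in for the patched jump site */

static void jump_label_inc_demo(void)
{
    /* Only the 0 -> 1 transition rewrites the branch. */
    if (atomic_fetch_add(&enabled, 1) == 0)
        branch_live = 1;        /* kernel: jump_label_update(JUMP_LABEL_ENABLE) */
}

static void jump_label_dec_demo(void)
{
    /* Only the 1 -> 0 transition rewrites it back. */
    if (atomic_fetch_sub(&enabled, 1) == 1)
        branch_live = 0;        /* kernel: jump_label_update(JUMP_LABEL_DISABLE) */
}

int main(void)
{
    jump_label_inc_demo();
    jump_label_inc_demo();
    jump_label_dec_demo();
    printf("live after 2 inc, 1 dec: %d\n", branch_live);  /* still 1 */
    jump_label_dec_demo();
    printf("live after final dec: %d\n", branch_live);     /* 0 */
    return 0;
}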
@@ -807,7 +808,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) graph_unlock(); raw_local_irq_restore(flags); -out_set_class_cache: if (!subclass || force) lock->class_cache[0] = class; else if (subclass < NR_LOCKDEP_CACHING_CLASSES) @@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, printk("\n"); printk("======================================================\n"); printk("[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_ident(); + print_kernel_version(); printk("-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); @@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr, printk("======================================================\n"); printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); - print_kernel_ident(); + print_kernel_version(); printk("------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), @@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, printk("\n"); printk("=============================================\n"); printk("[ INFO: possible recursive locking detected ]\n"); - print_kernel_ident(); + print_kernel_version(); printk("---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); @@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, printk("\n"); printk("=================================\n"); printk("[ INFO: inconsistent lock state ]\n"); - print_kernel_ident(); + print_kernel_version(); printk("---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", @@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr, printk("\n"); printk("=========================================================\n"); printk("[ INFO: possible irq lock inversion dependency detected ]\n"); - print_kernel_ident(); + print_kernel_version(); printk("---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); @@ -3175,7 +3175,6 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk("\n"); printk("=====================================\n"); printk("[ BUG: bad unlock balance detected! ]\n"); - print_kernel_ident(); printk("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); @@ -3620,7 +3619,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk("\n"); printk("=================================\n"); printk("[ BUG: bad contention detected! ]\n"); - print_kernel_ident(); printk("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); @@ -3976,8 +3974,7 @@ void __init lockdep_info(void) #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { - printk("WARNING: lockdep init error! lock-%s was acquired" - "before lockdep_init\n", lock_init_error); + printk("WARNING: lockdep init error! 
Arch code didn't call lockdep_init() early enough?\n"); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } @@ -3996,7 +3993,6 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, printk("\n"); printk("=========================\n"); printk("[ BUG: held lock freed! ]\n"); - print_kernel_ident(); printk("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); @@ -4054,7 +4050,6 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); printk("[ BUG: lock held at task exit time! ]\n"); - print_kernel_ident(); printk("-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", curr->comm, task_pid_nr(curr)); @@ -4152,7 +4147,6 @@ void lockdep_sys_exit(void) printk("\n"); printk("================================================\n"); printk("[ BUG: lock held when returning to user space! ]\n"); - print_kernel_ident(); printk("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); @@ -4172,33 +4166,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("\n"); printk("===============================\n"); printk("[ INFO: suspicious RCU usage. ]\n"); - print_kernel_ident(); printk("-------------------------------\n"); printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); - - /* - * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU - * considers that CPU to be in an "extended quiescent state", - * which means that RCU will be completely ignoring that CPU. - * Therefore, rcu_read_lock() and friends have absolutely no - * effect on a CPU running in that state. In other words, even if - * such an RCU-idle CPU has called rcu_read_lock(), RCU might well - * delete data structures out from under it. RCU really has no - * choice here: we need to keep an RCU-free window in idle where - * the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run - * in the idle task. - * - * So complain bitterly if someone does call rcu_read_lock(), - * rcu_read_lock_bh() and so on from extended quiescent states. - */ - if (rcu_is_cpu_idle()) - printk("RCU used illegally from extended quiescent state!\n"); - lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); diff --git a/trunk/kernel/panic.c b/trunk/kernel/panic.c index 3458469eb7c3..b26593604214 100644 --- a/trunk/kernel/panic.c +++ b/trunk/kernel/panic.c @@ -237,20 +237,11 @@ void add_taint(unsigned flag) * Can't trust the integrity of the kernel anymore. * We don't call directly debug_locks_off() because the issue * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging/out-of-tree - * development and post-warning case. + * Also we want to keep up lockdep for staging development and + * post-warning case. 
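The add_taint() hunk just below collapses the taint switch back to a two-flag test: most taints turn lock debugging off, on the theory that lockdep state may already be unreliable, while staging (TAINT_CRAP) and WARN taints are considered benign enough to keep it running. A sketch of the gating; the enum values are illustrative, not the kernel's bit assignments:

#include <stdio.h>

enum taint { TAINT_CRAP, TAINT_WARN, TAINT_DIE };  /* illustrative subset */

static unsigned long tainted_mask;
static int debug_locks = 1;

static void add_taint_demo(enum taint flag)
{
    /* Staging and WARN taints keep lock debugging usable. */
    if (flag != TAINT_CRAP && flag != TAINT_WARN && debug_locks) {
        debug_locks = 0;
        printf("Disabling lock debugging due to kernel taint\n");
    }
    tainted_mask |= 1UL << flag;
}

int main(void)
{
    add_taint_demo(TAINT_WARN);
    printf("after WARN: debug_locks=%d\n", debug_locks);   /* still 1 */
    add_taint_demo(TAINT_DIE);
    printf("after DIE:  debug_locks=%d\n", debug_locks);   /* now 0 */
    return 0;
}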
*/ - switch (flag) { - case TAINT_CRAP: - case TAINT_OOT_MODULE: - case TAINT_WARN: - case TAINT_FIRMWARE_WORKAROUND: - break; - - default: - if (__debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); - } + if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) + printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } diff --git a/trunk/kernel/posix-cpu-timers.c b/trunk/kernel/posix-cpu-timers.c index 125cb67daa21..e7cb76dc18f5 100644 --- a/trunk/kernel/posix-cpu-timers.c +++ b/trunk/kernel/posix-cpu-timers.c @@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { return now.sched < then.sched; } else { - return now.cpu < then.cpu; + return cputime_lt(now.cpu, then.cpu); } } static inline void cpu_time_add(const clockid_t which_clock, @@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { acc->sched += val.sched; } else { - acc->cpu += val.cpu; + acc->cpu = cputime_add(acc->cpu, val.cpu); } } static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, @@ -98,11 +98,24 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { a.sched -= b.sched; } else { - a.cpu -= b.cpu; + a.cpu = cputime_sub(a.cpu, b.cpu); } return a; } +/* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -135,26 +148,28 @@ static void bump_cpu_timer(struct k_itimer *timer, } else { cputime_t delta, incr; - if (now.cpu < timer->it.cpu.expires.cpu) + if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) return; incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; + delta = cputime_sub(cputime_add(now.cpu, incr), + timer->it.cpu.expires.cpu); /* Don't use (incr*2 < delta), incr*2 might overflow. 
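The bump_cpu_timer() loop restored just below advances a periodic expiry past the current time while counting missed periods, and, as the comment says, it never forms incr*2: it doubles incr in place (each doubling is safe because the test guarantees 2*incr < delta), then walks back down by halves, consuming the largest chunk that still fits. That costs O(log(delta/incr)) steps with no overflow-prone multiply. A standalone demonstration:

#include <stdio.h>

/* Advance a periodic timer past `now` (now >= *expires assumed, as the
 * caller checks), returning the number of whole periods skipped. */
static unsigned long bump(unsigned long *expires, unsigned long incr,
                          unsigned long now)
{
    unsigned long delta = now + incr - *expires;
    unsigned long overrun = 0;
    int i;

    /* Double incr until one more doubling would pass delta. */
    for (i = 0; incr < delta - incr; i++)
        incr += incr;
    /* Walk back down, consuming the largest fitting chunks. */
    for (; i >= 0; incr >>= 1, i--) {
        if (delta < incr)
            continue;
        *expires += incr;
        overrun += 1UL << i;
        delta -= incr;
    }
    return overrun;
}

int main(void)
{
    unsigned long expires = 100;
    unsigned long overrun = bump(&expires, 7, 130);   /* period 7, now 130 */

    /* 100 + 5*7 = 135 is the first expiry after 130, so 5 overruns. */
    printf("new expiry %lu, overruns %lu\n", expires, overrun);
    return 0;
}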
*/ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) + for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) + incr = cputime_add(incr, incr); + for (; i >= 0; incr = cputime_halve(incr), i--) { + if (cputime_lt(delta, incr)) continue; - timer->it.cpu.expires.cpu += incr; + timer->it.cpu.expires.cpu = + cputime_add(timer->it.cpu.expires.cpu, incr); timer->it_overrun += 1 << i; - delta -= incr; + delta = cputime_sub(delta, incr); } } } static inline cputime_t prof_ticks(struct task_struct *p) { - return p->utime + p->stime; + return cputime_add(p->utime, p->stime); } static inline cputime_t virt_ticks(struct task_struct *p) { @@ -233,8 +248,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime += t->utime; - times->stime += t->stime; + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: @@ -243,10 +258,10 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { - if (b->utime > a->utime) + if (cputime_gt(b->utime, a->utime)) a->utime = b->utime; - if (b->stime > a->stime) + if (cputime_gt(b->stime, a->stime)) a->stime = b->stime; if (b->sum_exec_runtime > a->sum_exec_runtime) @@ -291,7 +306,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); @@ -455,24 +470,26 @@ static void cleanup_timers(struct list_head *head, unsigned long long sum_exec_runtime) { struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; + cputime_t ptime = cputime_add(utime, stime); list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; + if (cputime_lt(timer->expires.cpu, ptime)) { + timer->expires.cpu = cputime_zero; } else { - timer->expires.cpu -= ptime; + timer->expires.cpu = cputime_sub(timer->expires.cpu, + ptime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; + if (cputime_lt(timer->expires.cpu, utime)) { + timer->expires.cpu = cputime_zero; } else { - timer->expires.cpu -= utime; + timer->expires.cpu = cputime_sub(timer->expires.cpu, + utime); } } @@ -503,7 +520,8 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) struct signal_struct *const sig = tsk->signal; cleanup_timers(tsk->signal->cpu_timers, - tsk->utime + sig->utime, tsk->stime + sig->stime, + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -522,7 +540,8 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) static inline int expires_gt(cputime_t expires, cputime_t new_exp) { - return expires == 0 || expires > new_exp; + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); } /* @@ -632,7 +651,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + cpu->cpu = cputime_add(cputime.utime, 
cputime.stime); break; case CPUCLOCK_VIRT: cpu->cpu = cputime.utime; @@ -899,12 +918,12 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long soft; maxfire = 20; - tsk->cputime_expires.prof_exp = 0; + tsk->cputime_expires.prof_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { + if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { tsk->cputime_expires.prof_exp = t->expires.cpu; break; } @@ -914,12 +933,12 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->cputime_expires.virt_exp = 0; + tsk->cputime_expires.virt_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { + if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { tsk->cputime_expires.virt_exp = t->expires.cpu; break; } @@ -990,19 +1009,20 @@ static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, cputime_t *expires, cputime_t cur_time, int signo) { - if (!it->expires) + if (cputime_eq(it->expires, cputime_zero)) return; - if (cur_time >= it->expires) { - if (it->incr) { - it->expires += it->incr; + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); it->error += it->incr_error; if (it->error >= onecputick) { - it->expires -= cputime_one_jiffy; + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); it->error -= onecputick; } } else { - it->expires = 0; + it->expires = cputime_zero; } trace_itimer_expire(signo == SIGPROF ? 
@@ -1011,7 +1031,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || it->expires < *expires)) { + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { *expires = it->expires; } } @@ -1026,7 +1048,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, */ static inline int task_cputime_zero(const struct task_cputime *cputime) { - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) return 1; return 0; } @@ -1052,15 +1076,15 @@ static void check_process_timers(struct task_struct *tsk, */ thread_group_cputimer(tsk, &cputime); utime = cputime.utime; - ptime = utime + cputime.stime; + ptime = cputime_add(utime, cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; - prof_expires = 0; + prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || ptime < tl->expires.cpu) { + if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { prof_expires = tl->expires.cpu; break; } @@ -1070,12 +1094,12 @@ static void check_process_timers(struct task_struct *tsk, ++timers; maxfire = 20; - virt_expires = 0; + virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || utime < tl->expires.cpu) { + if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { virt_expires = tl->expires.cpu; break; } @@ -1130,7 +1154,8 @@ static void check_process_timers(struct task_struct *tsk, } } x = secs_to_cputime(soft); - if (!prof_expires || x < prof_expires) { + if (cputime_eq(prof_expires, cputime_zero) || + cputime_lt(x, prof_expires)) { prof_expires = x; } } @@ -1224,9 +1249,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) static inline int task_cputime_expired(const struct task_cputime *sample, const struct task_cputime *expires) { - if (expires->utime && sample->utime >= expires->utime) + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) return 1; - if (expires->stime && sample->utime + sample->stime >= expires->stime) + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) return 1; if (expires->sum_exec_runtime != 0 && sample->sum_exec_runtime >= expires->sum_exec_runtime) @@ -1361,18 +1389,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be relative, *newval argument is relative and we update * it to be absolute. */ - if (*oldval) { - if (*oldval <= now.cpu) { + if (!cputime_eq(*oldval, cputime_zero)) { + if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. 
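check_cpu_itimer(), completed above, carries a sub-tick error term alongside the increment: each firing adds incr_error, and once the accumulated error reaches one CPU tick (onecputick) the expiry is pulled back by a jiffy, distributing the rounding error Bresenham-style so a period that is not a whole number of ticks stays accurate on average. A demonstration with made-up units (incr_error and onecputick here are stand-ins):

#include <stdio.h>

int main(void)
{
    /* A 2.7-tick period stored as incr = 3 ticks plus a 0.3-tick error,
     * with onecputick = 10 error units per tick. */
    unsigned long expires = 0, incr = 3;
    unsigned long error = 0, incr_error = 3, onecputick = 10;
    int firing;

    for (firing = 1; firing <= 10; firing++) {
        expires += incr;        /* kernel: cputime_add(expires, incr) */
        error += incr_error;
        if (error >= onecputick) {
            expires -= 1;       /* kernel: cputime_sub(..., cputime_one_jiffy) */
            error -= onecputick;
        }
        printf("firing %2d at tick %lu\n", firing, expires);
    }
    /* Ends at 27 == 10 * 2.7, not at the rounded-up 30. */
    return 0;
}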
*/ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval = cputime_sub(*oldval, now.cpu); } } - if (!*newval) + if (cputime_eq(*newval, cputime_zero)) return; - *newval += now.cpu; + *newval = cputime_add(*newval, now.cpu); } /* diff --git a/trunk/kernel/printk.c b/trunk/kernel/printk.c index 989e4a52da76..7982a0a841ea 100644 --- a/trunk/kernel/printk.c +++ b/trunk/kernel/printk.c @@ -199,7 +199,7 @@ void __init setup_log_buf(int early) unsigned long mem; mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) + if (mem == MEMBLOCK_ERROR) return; new_log_buf = __va(mem); } else { @@ -688,7 +688,6 @@ static void zap_locks(void) oops_timestamp = jiffies; - debug_locks_off(); /* If a crash is occurring, make sure we can't deadlock */ raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ @@ -841,8 +840,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) boot_delay_msec(); printk_delay(); + preempt_disable(); /* This stops the holder of console_sem just where we want him */ - local_irq_save(flags); + raw_local_irq_save(flags); this_cpu = smp_processor_id(); /* @@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * recursion and return - but flag the recursion so that * it can be printed at the next appropriate moment: */ - if (!oops_in_progress && !lockdep_recursing(current)) { + if (!oops_in_progress) { recursion_bug = 1; goto out_restore_irqs; } @@ -962,8 +962,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) lockdep_on(); out_restore_irqs: - local_irq_restore(flags); + raw_local_irq_restore(flags); + preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); diff --git a/trunk/kernel/ptrace.c b/trunk/kernel/ptrace.c index 78ab24a7b0e4..24d04477b257 100644 --- a/trunk/kernel/ptrace.c +++ b/trunk/kernel/ptrace.c @@ -96,20 +96,9 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) { + child->signal->group_stop_count)) child->jobctl |= JOBCTL_STOP_PENDING; - /* - * This is only possible if this thread was cloned by the - * traced task running in the stopped group, set the signal - * for the future reports. - * FIXME: we should change ptrace_init_task() to handle this - * case. - */ - if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) - child->jobctl |= SIGSTOP; - } - /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child diff --git a/trunk/kernel/rcu.h b/trunk/kernel/rcu.h index aa88baab5f78..f600868d550d 100644 --- a/trunk/kernel/rcu.h +++ b/trunk/kernel/rcu.h @@ -29,13 +29,6 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* - * Process-level increment to ->dynticks_nesting field. This allows for - * architectures that use half-interrupts and half-exceptions from - * process context. 
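The vprintk() hunk earlier in this patch restores preempt_disable() plus a raw IRQ save around the log buffer and drops the lockdep_recursing() exemption: if the CPU already inside printk recurses (for example through a lock-debugging splat triggered by printk itself), the message is dropped, a recursion_bug flag is set, and a note is emitted by the next non-recursive printk. A single-threaded sketch of that detect-and-defer shape, with a CPU id standing in for the logbuf lock owner:

#include <stdio.h>
#include <stdarg.h>

static int printk_cpu = -1;     /* CPU currently "holding" the log lock */
static int recursion_bug;

static int vprintk_demo(int this_cpu, const char *fmt, ...)
{
    va_list args;

    if (printk_cpu == this_cpu) {
        /* Recursed on the same CPU: flag it and bail out; a note is
         * printed by the next non-recursive call. */
        recursion_bug = 1;
        return 0;
    }
    printk_cpu = this_cpu;              /* "take" the lock */
    if (recursion_bug) {
        recursion_bug = 0;
        printf("BUG: recent printk recursion!\n");
    }
    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);
    printk_cpu = -1;                    /* "release" */
    return 1;
}

int main(void)
{
    printk_cpu = 0;                     /* pretend CPU 0 is mid-printk */
    vprintk_demo(0, "from inside printk\n");    /* deferred */
    printk_cpu = -1;
    vprintk_demo(0, "normal message\n");        /* flushes the note */
    return 0;
}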
- */ -#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) - /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/trunk/kernel/rcupdate.c b/trunk/kernel/rcupdate.c index 2bc4e135ff23..c5b98e565aee 100644 --- a/trunk/kernel/rcupdate.c +++ b/trunk/kernel/rcupdate.c @@ -93,8 +93,6 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) - return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); @@ -318,13 +316,3 @@ struct debug_obj_descr rcuhead_debug_descr = { }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) -{ - trace_rcu_torture_read(rcutorturename, rhp); -} -EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); -#else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) -#endif diff --git a/trunk/kernel/rcutiny.c b/trunk/kernel/rcutiny.c index 977296dca0a4..636af6d9c6e5 100644 --- a/trunk/kernel/rcutiny.c +++ b/trunk/kernel/rcutiny.c @@ -53,137 +53,31 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; +#ifdef CONFIG_NO_HZ -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(long long oldval) -{ - if (rcu_dynticks_nesting) { - RCU_TRACE(trace_rcu_dyntick("--=", - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ -} - -/* - * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). - */ -void rcu_idle_enter(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting = 0; - rcu_idle_enter_common(oldval); - local_irq_restore(flags); -} +static long rcu_dynticks_nesting = 1; /* - * Exit an interrupt handler towards idle. + * Enter dynticks-idle mode, which is an extended quiescent state + * if we have fully entered that mode (i.e., if the new value of + * dynticks_nesting is zero). */ -void rcu_irq_exit(void) +void rcu_enter_nohz(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting--; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(oldval); - local_irq_restore(flags); -} - -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. 
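The rcutiny hunk above replaces the rcu_idle_enter()/rcu_idle_exit() machinery with the older rcu_enter_nohz()/rcu_exit_nohz(): a bare nesting counter that starts at 1 (the task context counts as "in"), where hitting zero means the CPU is in an extended quiescent state and a quiescent state is reported. A sketch of the counter semantics:

#include <stdio.h>

static long rcu_dynticks_nesting = 1;   /* 1: task context counts as "in" */

static void rcu_sched_qs_demo(void)
{
    printf("quiescent state reported\n");
}

static void rcu_enter_nohz_demo(void)   /* going idle */
{
    if (--rcu_dynticks_nesting == 0)
        rcu_sched_qs_demo();    /* fully idle: RCU may ignore this CPU */
}

static void rcu_exit_nohz_demo(void)    /* leaving idle */
{
    rcu_dynticks_nesting++;
}

int main(void)
{
    rcu_enter_nohz_demo();              /* 1 -> 0: reports a QS */
    rcu_exit_nohz_demo();               /* 0 -> 1: visible to RCU again */
    printf("nesting=%ld\n", rcu_dynticks_nesting);
    return 0;
}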
*/ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } + if (--rcu_dynticks_nesting == 0) + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ } /* - * Exit idle, so that we are no longer in an extended quiescent state. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rcu_dynticks_nesting = DYNTICK_TASK_NESTING; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); -} - -/* - * Enter an interrupt handler, moving away from idle. + * Exit dynticks-idle mode, so that we are no longer in an extended + * quiescent state. */ -void rcu_irq_enter(void) +void rcu_exit_nohz(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); -} - -#ifdef CONFIG_PROVE_RCU - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -int rcu_is_cpu_idle(void) -{ - return !rcu_dynticks_nesting; } -EXPORT_SYMBOL(rcu_is_cpu_idle); - -#endif /* #ifdef CONFIG_PROVE_RCU */ -/* - * Test whether the current CPU was interrupted from idle. Nested - * interrupts don't count, we must be running at the first interrupt - * level. - */ -int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 0; -} +#endif /* #ifdef CONFIG_NO_HZ */ /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). @@ -232,13 +126,14 @@ void rcu_bh_qs(int cpu) /* * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. This function must - * be called from hardirq context. It is normally called from the - * scheduling-clock interrupt. + * quiescent state, and, if so, tell RCU about it. */ void rcu_check_callbacks(int cpu, int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) + if (user || + (idle_cpu(cpu) && + !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); @@ -259,11 +154,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* If no RCU callbacks ready to invoke, just return. 
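__rcu_process_callbacks() below drains only the ready portion of a singly linked callback list: donetail marks where the callbacks whose grace period has ended stop, and anything queued later stays on the list, which is exactly what the "just return" check tests. A userspace sketch of the list handling (grace_period_end() stands in for the real grace-period machinery advancing donetail):

#include <stdio.h>

struct rcu_head {
    struct rcu_head *next;
    void (*func)(struct rcu_head *);
};

/* Tiny control block: list head plus tail markers into the chain. */
static struct rcu_head *rcucblist;
static struct rcu_head **donetail = &rcucblist;  /* end of ready callbacks */
static struct rcu_head **curtail = &rcucblist;   /* end of the whole list */

static void call_rcu_demo(struct rcu_head *head, void (*func)(struct rcu_head *))
{
    head->func = func;
    head->next = NULL;
    *curtail = head;                    /* append at the tail */
    curtail = &head->next;
}

static void grace_period_end(void)
{
    donetail = curtail;                 /* everything queued so far is ready */
}

static void process_callbacks(void)
{
    struct rcu_head *list;

    if (donetail == &rcucblist)         /* no callbacks ready: just return */
        return;
    list = rcucblist;                   /* detach the ready sublist */
    rcucblist = *donetail;
    *donetail = NULL;
    if (curtail == donetail)
        curtail = &rcucblist;
    donetail = &rcucblist;
    while (list) {
        struct rcu_head *next = list->next;
        list->func(list);
        list = next;
    }
}

static void cb(struct rcu_head *h) { printf("invoked %p\n", (void *)h); }

int main(void)
{
    struct rcu_head a, b;

    call_rcu_demo(&a, cb);
    grace_period_end();
    call_rcu_demo(&b, cb);              /* queued after the GP: not ready */
    process_callbacks();                /* invokes only a */
    process_callbacks();                /* nothing ready, returns */
    grace_period_end();
    process_callbacks();                /* now invokes b */
    return 0;
}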
*/ if (&rcp->rcucblist == rcp->donetail) { RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - ACCESS_ONCE(rcp->rcucblist), - need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread())); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); return; } @@ -292,9 +183,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread())); + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); } static void rcu_process_callbacks(struct softirq_action *unused) diff --git a/trunk/kernel/rcutiny_plugin.h b/trunk/kernel/rcutiny_plugin.h index 9cb1ae4aabdd..2b0484a5dc28 100644 --- a/trunk/kernel/rcutiny_plugin.h +++ b/trunk/kernel/rcutiny_plugin.h @@ -312,8 +312,8 @@ static int rcu_boost(void) rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || - ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; + return rcu_preempt_ctrlblk.boost_tasks != NULL || + rcu_preempt_ctrlblk.exp_tasks != NULL; } /* @@ -885,19 +885,6 @@ static void invoke_rcu_callbacks(void) wake_up(&rcu_kthread_wq); } -#ifdef CONFIG_RCU_TRACE - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return rcu_kthread_task == current; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - /* * This kthread invokes RCU callbacks whose grace periods have * elapsed. It is awakened as needed, and takes the place of the @@ -951,18 +938,6 @@ void invoke_rcu_callbacks(void) raise_softirq(RCU_SOFTIRQ); } -#ifdef CONFIG_RCU_TRACE - -/* - * There is no callback kthread, so this thread is never it. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - void rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/trunk/kernel/rcutorture.c b/trunk/kernel/rcutorture.c index 88f17b8a3b1d..764825c2685c 100644 --- a/trunk/kernel/rcutorture.c +++ b/trunk/kernel/rcutorture.c @@ -61,11 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff; /* Hold time within burst (us). */ +static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff = 0; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ -static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ -static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ static int test_boost_duration = 4; /* Duration of each boost test, seconds. 
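The module parameters being deleted above follow the standard pattern: a static variable, module_param() to expose it (0444 meaning world-readable under /sys/module/<name>/parameters and not writable at runtime), and MODULE_PARM_DESC() for modinfo. A minimal module skeleton using one of them; this builds against kernel headers, not as a userspace program, and the module name is arbitrary:

#include <linux/module.h>
#include <linux/init.h>

static int fqs_duration;        /* 0 disables force-quiescent-state bursts */
module_param(fqs_duration, int, 0444); /* 0444: read-only in sysfs */
MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");

static int __init demo_init(void)
{
    pr_info("fqs_duration=%d\n", fqs_duration);
    return 0;
}

static void __exit demo_exit(void) { }

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Loaded with, for example, insmod demo.ko fqs_duration=100; the value is then visible via modinfo and sysfs but cannot be changed on a live system, since the permission bits grant no write access.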
*/ @@ -93,10 +91,6 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); module_param(test_boost_interval, int, 0444); @@ -125,10 +119,6 @@ static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #define RCU_TORTURE_PIPE_LEN 10 @@ -159,10 +149,6 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static long n_online_attempts; -static long n_online_successes; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -174,8 +160,6 @@ static int stutter_pause_test; #define RCUTORTURE_RUNNABLE_INIT 0 #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 @@ -183,7 +167,6 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -199,9 +182,6 @@ static int fullstop = FULLSTOP_RMMOD; */ static DEFINE_MUTEX(fullstop_mutex); -/* Forward reference. */ -static void rcu_torture_cleanup(void); - /* * Detect and respond to a system shutdown. 
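The shutdown handling introduced by this comment hangs off a notifier_block registered for reboot events, so the test can mark itself fully stopped before the system goes down instead of tripping its own error paths. A sketch of the registration pattern (kernel-module code; the callback body is illustrative):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int demo_shutdown_notify(struct notifier_block *unused1,
                                unsigned long unused2, void *unused3)
{
    pr_info("system shutdown detected, stopping test threads\n");
    /* rcutorture updates its fullstop state here, under fullstop_mutex */
    return NOTIFY_DONE;
}

static struct notifier_block demo_shutdown_nb = {
    .notifier_call = demo_shutdown_notify,
};

static int __init demo_init(void)
{
    register_reboot_notifier(&demo_shutdown_nb);
    return 0;
}

static void __exit demo_exit(void)
{
    unregister_reboot_notifier(&demo_shutdown_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");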
*/ @@ -632,30 +612,6 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw" -}; - static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -957,18 +913,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -990,7 +934,6 @@ static void rcu_torture_timer(unsigned long unused) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1008,8 +951,6 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) - rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { @@ -1053,7 +994,6 @@ rcu_torture_reader(void *arg) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1069,8 +1009,6 @@ rcu_torture_reader(void *arg) /* Should not happen, but... 
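The srcu_raw_ops table removed above is one instance of rcutorture's ops-table pattern: each RCU flavor is described by a struct of function pointers so the same torture loops can drive any implementation. A reduced sketch of the pattern (the field set is trimmed and all demo_* names are illustrative):

/* Reduced sketch of the rcu_torture_ops ops-table pattern. */
struct demo_torture_ops {
        int  (*readlock)(void);       /* enter a read-side critical section */
        void (*readunlock)(int idx);  /* exit it, passing back the token    */
        void (*sync)(void);           /* wait for a grace period            */
        const char *name;
};

static int  demo_read_lock(void)      { return 0; }  /* e.g. srcu_read_lock()   */
static void demo_read_unlock(int idx) { (void)idx; } /* e.g. srcu_read_unlock() */
static void demo_sync(void)           { }            /* e.g. synchronize_srcu() */

static struct demo_torture_ops demo_ops = {
        .readlock   = demo_read_lock,
        .readunlock = demo_read_unlock,
        .sync       = demo_sync,
        .name       = "demo",
};

/* The torture loop itself is then flavor-agnostic: */
static void one_read(struct demo_torture_ops *ops)
{
        int idx = ops->readlock();
        /* ... dereference and sanity-check the protected pointer ... */
        ops->readunlock(idx);
}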
*/ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) - rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { @@ -1118,8 +1056,7 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld", + "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -1131,11 +1068,7 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_rterror, n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers, - n_online_successes, - n_online_attempts, - n_offline_successes, - n_offline_attempts); + n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || @@ -1299,14 +1232,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d\n", + "test_boost_duration=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval); + test_boost_interval, test_boost_duration); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1356,131 +1287,6 @@ static int rcutorture_booster_init(int cpu) return 0; } -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. 
- */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - n_offline_attempts++; - if (cpu_down(cpu) == 0) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", - torture_type, cpu); - n_offline_successes++; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - n_online_attempts++; - if (cpu_up(cpu) == 0) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", - torture_type, cpu); - n_online_successes++; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - onoff_task = NULL; - return PTR_ERR(onoff_task); - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void -rcu_torture_onoff_init(void) -{ -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1585,11 +1391,6 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. 
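The rcu_torture_onoff kthread being deleted drives hotplug through cpu_down()/cpu_up(), the in-kernel equivalent of writing 0 or 1 to /sys/devices/system/cpu/cpuN/online. Its lifecycle is the stock kthread_run()/kthread_should_stop()/kthread_stop() pattern; a minimal sketch, with the interval and all demo_* names illustrative:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *demo_task;

static int demo_thread(void *arg)
{
        while (!kthread_should_stop()) {
                /* ... pick a CPU and call cpu_down()/cpu_up() here ... */
                schedule_timeout_interruptible(10 * HZ);  /* illustrative */
        }
        return 0;
}

static int demo_start(void)
{
        demo_task = kthread_run(demo_thread, NULL, "demo_onoff");
        if (IS_ERR(demo_task)) {
                int err = PTR_ERR(demo_task);

                demo_task = NULL;
                return err;
        }
        return 0;
}

static void demo_stop(void)
{
        if (demo_task)
                kthread_stop(demo_task);  /* makes kthread_should_stop() return true */
}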
*/ @@ -1615,7 +1416,7 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1806,18 +1607,6 @@ rcu_torture_init(void) } } } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_run(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - } - rcu_torture_onoff_init(); register_reboot_notifier(&rcutorture_shutdown_nb); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); diff --git a/trunk/kernel/rcutree.c b/trunk/kernel/rcutree.c index 6c4a6722abfd..6b76d812740c 100644 --- a/trunk/kernel/rcutree.c +++ b/trunk/kernel/rcutree.c @@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; NUM_RCU_LVL_3, \ NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .fqs_state = RCU_GP_IDLE, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ @@ -195,10 +195,12 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +#ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_NESTING, + .dynticks_nesting = 1, .dynticks = ATOMIC_INIT(1), }; +#endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ @@ -326,11 +328,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* - * The CPU is online, so send it a reschedule IPI. This forces - * it through the scheduler, and (inefficiently) also handles cases - * where idle loops fail to inform RCU about the CPU being idle. - */ + /* If preemptible RCU, no point in sending reschedule IPI. */ + if (rdp->preemptible) + return 0; + + /* The CPU is online, so send it a reschedule IPI. */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); else @@ -341,181 +343,59 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -/* - * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle - * - * If the new value of the ->dynticks_nesting counter now is zero, - * we really have entered idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) -{ - trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_prepare_for_idle(smp_processor_id()); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. 
*/ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); -} +#ifdef CONFIG_NO_HZ /** - * rcu_idle_enter - inform RCU that current CPU is entering idle + * rcu_enter_nohz - inform RCU that current CPU is entering nohz * - * Enter idle mode, in other words, -leave- the mode in which RCU + * Enter nohz mode, in other words, -leave- the mode in which RCU * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. + * critical sections can occur in irq handlers in nohz mode, a possibility + * handled by rcu_irq_enter() and rcu_irq_exit()). */ -void rcu_idle_enter(void) +void rcu_enter_nohz(void) { unsigned long flags; - long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting = 0; - rcu_idle_enter_common(rdtp, oldval); + if (--rdtp->dynticks_nesting) { + local_irq_restore(flags); + return; + } + trace_rcu_dyntick("Start"); + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); local_irq_restore(flags); } -/** - * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle - * - * Exit from an interrupt handler, which might possibly result in entering - * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. - * - * This code assumes that the idle loop never does anything that might - * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. +/* + * rcu_exit_nohz - inform RCU that current CPU is leaving nohz * - * You have been warned. + * Exit nohz mode, in other words, -enter- the mode in which RCU + * read-side critical sections normally occur. */ -void rcu_irq_exit(void) +void rcu_exit_nohz(void) { unsigned long flags; - long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); - else - rcu_idle_enter_common(rdtp, oldval); - local_irq_restore(flags); -} - -/* - * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) -{ + if (rdtp->dynticks_nesting++) { + local_irq_restore(flags); + return; + } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. 
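The rcu_enter_nohz()/rcu_exit_nohz() pair restored here implements the dynticks parity protocol: ->dynticks stays even while the CPU is dyntick-idle and odd otherwise, with full memory barriers around each transition so remote CPUs sampling the counter can order their observation against this CPU's read-side critical sections. A user-space sketch of just the parity rule, where GCC's __atomic builtins stand in for the kernel's atomic_inc() and smp_mb():

#include <assert.h>

static int dynticks = 1;        /* boots "not idle", hence odd */

static void enter_idle(void)
{
        __atomic_add_fetch(&dynticks, 1, __ATOMIC_SEQ_CST);  /* odd -> even */
}

static void exit_idle(void)
{
        __atomic_add_fetch(&dynticks, 1, __ATOMIC_SEQ_CST);  /* even -> odd */
}

/* A remote CPU deciding whether this CPU is in a quiescent state: */
static int cpu_is_idle_now(void)
{
        int snap = __atomic_load_n(&dynticks, __ATOMIC_SEQ_CST);

        return (snap & 0x1) == 0;       /* even counter => dyntick-idle */
}

int main(void)
{
        enter_idle();
        assert(cpu_is_idle_now());
        exit_idle();
        assert(!cpu_is_idle_now());
        return 0;
}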
*/ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); - trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - trace_rcu_dyntick("Error on exit: not idle task", - oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - long long oldval; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; - rcu_idle_exit_common(rdtp, oldval); - local_irq_restore(flags); -} - -/** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - */ -void rcu_irq_enter(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - long long oldval; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); - else - rcu_idle_exit_common(rdtp, oldval); + trace_rcu_dyntick("End"); local_irq_restore(flags); } @@ -562,37 +442,27 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -#ifdef CONFIG_PROVE_RCU - /** - * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle + * rcu_irq_enter - inform RCU of entry to hard irq context * - * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. + * If the CPU was idle with dynamic ticks active, this updates the + * rdtp->dynticks to let the RCU handling know that the CPU is active. 
*/ -int rcu_is_cpu_idle(void) +void rcu_irq_enter(void) { - int ret; - - preempt_disable(); - ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; - preempt_enable(); - return ret; + rcu_exit_nohz(); } -EXPORT_SYMBOL(rcu_is_cpu_idle); - -#endif /* #ifdef CONFIG_PROVE_RCU */ /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_irq_exit - inform RCU of exit from hard irq context * - * If the current CPU is idle or running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. + * If the CPU was idle with dynamic ticks active, update the rdp->dynticks + * to let the RCU handling be aware that the CPU is going back to idle + * with no ticks. */ -int rcu_is_cpu_rrupt_from_idle(void) +void rcu_irq_exit(void) { - return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; + rcu_enter_nohz(); } #ifdef CONFIG_SMP @@ -605,7 +475,7 @@ int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - return (rdp->dynticks_snap & 0x1) == 0; + return 0; } /* @@ -642,6 +512,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ +#else /* #ifdef CONFIG_NO_HZ */ + +#ifdef CONFIG_SMP + +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + return 0; +} + +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#endif /* #else #ifdef CONFIG_NO_HZ */ + +int rcu_cpu_stall_suppress __read_mostly; + static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -976,8 +866,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); + rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -987,7 +877,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, @@ -1037,7 +927,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp = rcu_get_root(rsp); raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -1101,7 +991,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; + rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock.
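For the dyntick_save_progress_counter() hunk above: force_quiescent_state() samples ->dynticks once per holdout CPU and later resamples it in rcu_implicit_dynticks_qs() (elided by the hunk header). The decision rule is: if the snapshot was even, or the counter has moved since, the CPU passed through a quiescent state during the grace period. A sketch of that recheck, under the parity convention described earlier:

/* Sketch of snapshot-based quiescent-state detection (logic only). */
static int saw_quiescent_state(int snap, int curr)
{
        /*
         * Even snapshot: the CPU was dyntick-idle when first sampled.
         * Moved counter: the CPU passed through an idle or irq boundary
         * at some point after the sample was taken.
         */
        return (snap & 0x1) == 0 || curr != snap;
}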
*/ } @@ -1331,7 +1221,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) else raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); + rcu_report_exp_rnp(rsp, rnp); rcu_node_kthread_setaffinity(rnp, -1); } @@ -1373,9 +1263,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, 0, 0); - trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), - need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); + trace_rcu_batch_end(rsp->name, 0); return; } @@ -1403,17 +1291,12 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) debug_rcu_head_unqueue(list); __rcu_reclaim(rsp->name, list); list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) + if (++count >= bl) break; } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread()); + trace_rcu_batch_end(rsp->name, count); /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; @@ -1451,14 +1334,16 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). * Also schedule RCU core processing. * - * This function must be called from hardirq context. It is normally + * This function must be called with hardirqs disabled. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user @@ -1572,7 +1457,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_fqs_ret; /* no GP in progress, time updated. */ } rsp->fqs_active = 1; - switch (rsp->fqs_state) { + switch (rsp->signaled) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1588,7 +1473,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ if (rcu_gp_in_progress(rsp)) - rsp->fqs_state = RCU_FORCE_QS; + rsp->signaled = RCU_FORCE_QS; break; case RCU_FORCE_QS: @@ -1927,7 +1812,7 @@ static int rcu_pending(int cpu) * by the current CPU, even if none need be done immediately, returning * 1 if so. */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_needs_cpu_quick_check(int cpu) { /* RCU callbacks either ready or pending? 
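The condition restored in rcu_check_callbacks() decides whether the scheduling-clock interrupt arrived from a quiescent context: either user mode, or the idle task with no softirq in flight and at most one level of hardirq (the tick itself). Unpacked into a helper for readability; this is a paraphrase of the diff's condition, not kernel code:

#include <linux/hardirq.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static int tick_interrupted_quiescent_context(int cpu, int user)
{
        return user ||                             /* user mode is quiescent  */
               (idle_cpu(cpu) &&                   /* idle task was running   */
                rcu_scheduler_active &&            /* past single-task boot   */
                !in_softirq() &&                   /* no softirq in flight    */
                hardirq_count() <=                 /* only the tick's own     */
                        (1 << HARDIRQ_SHIFT));     /* hardirq nesting level   */
}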
*/ return per_cpu(rcu_sched_data, cpu).nxtlist || @@ -2028,9 +1913,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; +#ifdef CONFIG_NO_HZ rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); +#endif /* #ifdef CONFIG_NO_HZ */ rdp->cpu = cpu; rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -2057,10 +1942,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* @@ -2142,7 +2023,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_send_cbs_to_online(&rcu_bh_state); rcu_send_cbs_to_online(&rcu_sched_state); rcu_preempt_send_cbs_to_online(); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/trunk/kernel/rcutree.h b/trunk/kernel/rcutree.h index fddff92d6676..849ce9ec51fe 100644 --- a/trunk/kernel/rcutree.h +++ b/trunk/kernel/rcutree.h @@ -84,10 +84,9 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - long long dynticks_nesting; /* Track irq/process nesting level. */ - /* Process level is worth LLONG_MAX/2. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ + int dynticks_nesting; /* Track irq/process nesting level. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ }; /* RCU's kthread states for tracing. */ @@ -275,12 +274,16 @@ struct rcu_data { /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ +#ifdef CONFIG_NO_HZ /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ +#endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ @@ -299,12 +302,16 @@ struct rcu_data { struct rcu_state *rsp; }; -/* Values for fqs_state field in struct rcu_state. */ +/* Values for signaled field in struct rcu_state. */ #define RCU_GP_IDLE 0 /* No grace period in progress. */ #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ +#ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ @@ -354,7 +361,7 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 fqs_state ____cacheline_internodealigned_in_smp; + u8 signaled ____cacheline_internodealigned_in_smp; /* Force QS state. */ u8 fqs_active; /* force_quiescent_state() */ /* is running. 
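The signaled field (renamed back from fqs_state by this patch) steps through a small per-grace-period state machine: RCU_GP_IDLE until a grace period starts, RCU_GP_INIT while the rcu_node tree is set up, then RCU_SAVE_DYNTICK (snapshot the dynticks counters) followed by RCU_FORCE_QS (resample and push holdout CPUs). A condensed sketch of how force_quiescent_state() advances it; this paraphrases the switch shown earlier in the diff and is not a drop-in implementation:

enum demo_fqs_state { GP_IDLE, GP_INIT, SAVE_DYNTICK, FORCE_QS };

static enum demo_fqs_state state = GP_IDLE;

static void force_quiescent_state_step(void)
{
        switch (state) {
        case GP_IDLE:
        case GP_INIT:
                break;          /* grace period not yet in a forceable phase */
        case SAVE_DYNTICK:
                /* ... scan CPUs, record their dynticks snapshots ... */
                state = FORCE_QS;
                break;
        case FORCE_QS:
                /* ... resample snapshots, send resched IPIs to holdouts ... */
                break;
        }
}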
*/ @@ -444,8 +451,7 @@ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); @@ -455,7 +461,6 @@ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); -static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, @@ -468,8 +473,5 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu); -static void rcu_cleanup_after_idle(int cpu); -static void rcu_prepare_for_idle(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/trunk/kernel/rcutree_plugin.h b/trunk/kernel/rcutree_plugin.h index 8bb35d73e1f9..4b9b9f8a4184 100644 --- a/trunk/kernel/rcutree_plugin.h +++ b/trunk/kernel/rcutree_plugin.h @@ -312,7 +312,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; - int empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -383,10 +382,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, - * so we must take a snapshot of the expedited state. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report("preempt_rcu", rnp->gpnum, @@ -409,8 +406,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ - if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); + if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + rcu_report_exp_rnp(&rcu_preempt_state, rnp); } else { local_irq_restore(flags); } @@ -732,13 +729,9 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Most callers will set the "wake" flag, but the task initiating the - * expedited grace period need not wake itself. - * * Caller must hold sync_rcu_preempt_exp_mutex. 
*/ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) { unsigned long flags; unsigned long mask; @@ -751,8 +744,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) - wake_up(&sync_rcu_preempt_exp_wq); + wake_up(&sync_rcu_preempt_exp_wq); break; } mask = rnp->grpmask; @@ -785,7 +777,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) must_wait = 1; } if (!must_wait) - rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ + rcu_report_exp_rnp(rsp, rnp); } /* @@ -1077,9 +1069,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) { + return; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1165,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static struct lock_class_key rcu_boost_class; + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1227,13 +1221,15 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); + /* Avoid lockdep false positives. This rt_mutex is its own thing. */ + lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, + "rcu_boost_mutex"); t->rcu_boost_mutex = &mtx; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - return ACCESS_ONCE(rnp->exp_tasks) != NULL || - ACCESS_ONCE(rnp->boost_tasks) != NULL; + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; } /* @@ -1332,15 +1328,6 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_restore(flags); } -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __get_cpu_var(rcu_cpu_kthread_task) == current; -} - /* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost @@ -1785,11 +1772,6 @@ static void invoke_rcu_callbacks_kthread(void) WARN_ON_ONCE(1); } -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } @@ -1925,7 +1907,7 @@ void synchronize_sched_expedited(void) * grace period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); + snap = atomic_read(&sync_sched_expedited_started) - 1; smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -1957,243 +1939,88 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs - * any flavor of RCU. + * Because we have preemptible RCU, just check whether this CPU needs + * any flavor of RCU. Do not chew up lots of CPU cycles with preemption + * disabled in a most-likely vain attempt to cause RCU not to need this CPU. 
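rcu_boost(), whose return statement is touched above, boosts a preempted reader by conjuring an rt_mutex that is already held by the target task (rt_mutex_init_proxy_locked()) and then contending for it: priority inheritance raises the reader to the booster's RT priority until it leaves its read-side critical section and releases the lock from rcu_read_unlock_special(). The shape of the trick, condensed from the code in this file (kernel-internal helpers; not a standalone example):

static void boost_one_reader(struct task_struct *t, struct rt_mutex *mtx)
{
        rt_mutex_init_proxy_locked(mtx, t); /* mtx is born "held by t"          */
        t->rcu_boost_mutex = mtx;           /* t releases it at rcu_read_unlock */

        rt_mutex_lock(mtx);    /* blocks; PI boosts t until it drops mtx        */
        rt_mutex_unlock(mtx);  /* release our own acquisition (keeps lockdep happy) */
}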
*/ int rcu_needs_cpu(int cpu) { - return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(int cpu) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, - * is nothing. - */ -static void rcu_prepare_for_idle(int cpu) -{ + return rcu_needs_cpu_quick_check(cpu); } #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. - * - * The following three proprocessor symbols control this state machine: - * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - * to satisfy RCU. Beyond this point, it is better to incur a periodic - * scheduling-clock interrupt than to loop through the state machine - * at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - * optional if RCU does not need anything immediately from this - * CPU, even if this CPU still has RCU callbacks queued. The first - * times through the state machine are mandatory: we need to give - * the state machine a chance to communicate a quiescent state - * to the RCU core. - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * - * The values below work well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ -#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ - +#define RCU_NEEDS_CPU_FLUSHES 5 static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; - -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. 
*/ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending. The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - */ -static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) -{ - trace_rcu_prep_idle("Timer"); - return HRTIMER_NORESTART; -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ - static int firsttime = 1; - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); - - hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtp->function = rcu_idle_gp_timer_func; - if (firsttime) { - unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); - - rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); - firsttime = 0; - } -} - -/* - * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will - * do nothing if this timer is not active, so just cancel it unconditionally. - */ -static void rcu_cleanup_after_idle(int cpu) -{ - hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); -} /* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done. This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode. In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. + * Because we are not supporting preemptible RCU, attempt to accelerate + * any current grace periods so that RCU no longer needs this CPU, but + * only if all other CPUs are already in dynticks-idle mode. This will + * allow the CPU cores to be powered down immediately, as opposed to after + * waiting many milliseconds for grace periods to elapse. * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. - * - * The caller must have disabled interrupts. */ -static void rcu_prepare_for_idle(int cpu) +int rcu_needs_cpu(int cpu) { - unsigned long flags; - - local_irq_save(flags); - - /* - * If there are no callbacks on this CPU, enter dyntick-idle mode. - * Also reset state to avoid prejudicing later attempts. - */ - if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; - local_irq_restore(flags); - trace_rcu_prep_idle("No callbacks"); - return; - } - - /* - * If in holdoff mode, just return. We will presumably have - * refrained from disabling the scheduling-clock tick. 
- */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { - local_irq_restore(flags); - trace_rcu_prep_idle("In holdoff"); - return; + int c = 0; + int snap; + int thatcpu; + + /* Check for being in the holdoff period. */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return rcu_needs_cpu_quick_check(cpu); + + /* Don't bother unless we are the last non-dyntick-idle CPU. */ + for_each_online_cpu(thatcpu) { + if (thatcpu == cpu) + continue; + snap = atomic_add_return(0, &per_cpu(rcu_dynticks, + thatcpu).dynticks); + smp_mb(); /* Order sampling of snap with end of grace period. */ + if ((snap & 0x1) != 0) { + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + return rcu_needs_cpu_quick_check(cpu); + } } /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu)) { - /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); - return; /* Nothing more to do immediately. */ + per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - local_irq_restore(flags); - trace_rcu_prep_idle("Begin holdoff"); - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ - return; + return rcu_needs_cpu_quick_check(cpu); } - /* - * Do one step of pushing the remaining RCU callbacks through - * the RCU core state machine. - */ -#ifdef CONFIG_TREE_PREEMPT_RCU - if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - local_irq_restore(flags); - rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state, 0); - local_irq_save(flags); - } -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + /* Do one step pushing remaining RCU callbacks through. */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - local_irq_save(flags); + c = c || per_cpu(rcu_sched_data, cpu).nxtlist; } if (per_cpu(rcu_bh_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - local_irq_save(flags); + c = c || per_cpu(rcu_bh_data, cpu).nxtlist; } - /* - * If RCU callbacks are still pending, RCU still needs this CPU. - * So try forcing the callbacks through the grace period. - */ - if (rcu_cpu_has_callbacks(cpu)) { - local_irq_restore(flags); - trace_rcu_prep_idle("More callbacks"); + /* If RCU callbacks are still pending, RCU still needs this CPU. 
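The rcu_needs_cpu() implementation restored above runs a tiny state machine on two per-CPU variables: rcu_dyntick_drain counts down the RCU_NEEDS_CPU_FLUSHES callback-flush attempts allowed per idle entry, and rcu_dyntick_holdoff records the jiffy at which the CPU gave up, so retries within the same jiffy are refused. The control flow, reduced to plain C (per-CPU storage and the last-non-dyntick-idle-CPU check are omitted):

#define FLUSHES 5                 /* mirrors RCU_NEEDS_CPU_FLUSHES     */

static int drain;                 /* per-CPU in the real code          */
static unsigned long holdoff;     /* jiffy at which we last gave up    */

static int needs_cpu(unsigned long now, int have_callbacks)
{
        if (holdoff == now)       /* in holdoff: no retries this jiffy */
                return have_callbacks;
        if (drain <= 0) {
                drain = FLUSHES;  /* first attempt: arm the counter    */
        } else if (--drain <= 0) {
                holdoff = now;    /* out of attempts: enter holdoff    */
                return have_callbacks;
        }
        /* ... push one batch of callbacks through, then recheck ... */
        return have_callbacks;
}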
*/ + if (c) invoke_rcu_core(); - } else { - local_irq_restore(flags); - trace_rcu_prep_idle("Callbacks drained"); - } + return c; } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/trunk/kernel/rcutree_trace.c b/trunk/kernel/rcutree_trace.c index 654cfe67f0d1..9feffa4c0695 100644 --- a/trunk/kernel/rcutree_trace.c +++ b/trunk/kernel/rcutree_trace.c @@ -67,11 +67,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); - seq_printf(m, " dt=%d/%llx/%d df=%lu", +#ifdef CONFIG_NO_HZ + seq_printf(m, " dt=%d/%d/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, @@ -139,11 +141,13 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); - seq_printf(m, ",%d,%llx,%d,%lu", +#ifdef CONFIG_NO_HZ + seq_printf(m, ",%d,%d,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -167,7 +171,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); +#ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); +#endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); @@ -272,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, gpnum, rsp->fqs_state, + rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, diff --git a/trunk/kernel/rtmutex-debug.c b/trunk/kernel/rtmutex-debug.c index 16502d3a71c8..8eafd1bd273e 100644 --- a/trunk/kernel/rtmutex-debug.c +++ b/trunk/kernel/rtmutex-debug.c @@ -101,7 +101,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! 
]\n"); - printk("%s\n", print_tainted()); printk( "--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), diff --git a/trunk/kernel/rtmutex.c b/trunk/kernel/rtmutex.c index a242e691c993..f9d8482dd487 100644 --- a/trunk/kernel/rtmutex.c +++ b/trunk/kernel/rtmutex.c @@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter *waiter) { int ret = 0; + int was_disabled; for (;;) { /* Try to acquire the lock: */ @@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, raw_spin_unlock(&lock->wait_lock); + was_disabled = irqs_disabled(); + if (was_disabled) + local_irq_enable(); + debug_rt_mutex_print_deadlock(waiter); schedule_rt_mutex(lock); + if (was_disabled) + local_irq_disable(); + raw_spin_lock(&lock->wait_lock); set_current_state(state); } diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched.c similarity index 79% rename from trunk/kernel/sched/core.c rename to trunk/kernel/sched.c index 457c881873cb..d6b149ccf925 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched.c @@ -1,5 +1,5 @@ /* - * kernel/sched/core.c + * kernel/sched.c * * Kernel scheduler and related syscalls * @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -74,17 +75,129 @@ #include #include +#include #ifdef CONFIG_PARAVIRT #include #endif -#include "sched.h" -#include "../workqueue_sched.h" +#include "sched_cpupri.h" +#include "workqueue_sched.h" +#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define DEF_TIMESLICE (100 * HZ / 1000) + +/* + * single value that denotes runtime == period, ie unlimited time. 
+ */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int rt_policy(int policy) +{ + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + raw_spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; ktime_t soft, hard, now; @@ -104,12 +217,580 @@ void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) } } -DEFINE_MUTEX(sched_domains_mutex); -DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + raw_spin_lock(&rt_b->rt_runtime_lock); + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + +/* + * sched_domains_mutex serializes calls to init_sched_domains, + * detach_destroy_domains and partition_sched_domains. 
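sched_rt_period_timer() above shows the standard self-rearming hrtimer pattern: hrtimer_forward() pushes the expiry forward by whole periods and returns how many elapsed, and the handler returns HRTIMER_RESTART until its periodic work reports idle. The pattern in isolation, with a made-up 100 ms period and demo_* names:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

#define DEMO_PERIOD_NS (100 * NSEC_PER_MSEC)   /* illustrative period */

static enum hrtimer_restart demo_period_timer(struct hrtimer *timer)
{
        ktime_t now;
        int overrun;
        int idle = 0;

        for (;;) {
                now = hrtimer_cb_get_time(timer);
                /* Advance expiry by whole periods; 0 => expiry still in the future. */
                overrun = hrtimer_forward(timer, now, ns_to_ktime(DEMO_PERIOD_NS));
                if (!overrun)
                        break;
                /* ... do one period's worth of work; set idle when done ... */
        }
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static struct hrtimer demo_timer;

static void demo_timer_start(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_period_timer;
        hrtimer_start(&demo_timer, ns_to_ktime(DEMO_PERIOD_NS), HRTIMER_MODE_REL);
}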
+ */ +static DEFINE_MUTEX(sched_domains_mutex); + +#ifdef CONFIG_CGROUP_SCHED + +#include + +struct cfs_rq; + +static LIST_HEAD(task_groups); + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchal_quota; + u64 runtime_expires; + + int idle, timer_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + + atomic_t load_weight; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; +}; + +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetic problems. + * The weight of a cfs_rq is the sum of the weights of the entities + * queued on it, so the weight of an entity should not be too large, + * and neither should the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) + +static int root_task_group_load = ROOT_TASK_GROUP_LOAD; +#endif + +/* Default task group. + * Every task in the system belongs to this group at bootup. + */ +struct task_group root_task_group; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to the currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e. when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together the list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group.
+ */ + unsigned long h_load; + + /* + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time + */ + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; + + unsigned long load_contribution; +#endif +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_timestamp; + int throttled, throttle_count; + struct list_head throttled_list; +#endif +#endif +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_BANDWIDTH +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + +static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); + + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). 
In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + raw_spin_unlock(&cfs_b->lock); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); +} +#else +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
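__start_cfs_bandwidth() above shows a classic shape: a timer whose callback takes cfs_b->lock cannot be cancelled while that lock is held, so the lock is dropped around the cancel and the state re-checked afterwards. A pthread sketch of the same shape, with hypothetical toy_* types standing in for the hrtimer machinery:

#include <pthread.h>
#include <stdbool.h>

struct toy_bw {
	pthread_mutex_t lock;
	bool timer_active;	/* protected by lock */
	bool callback_busy;	/* "hrtimer still running or queued" */
};

static void toy_cancel_timer(struct toy_bw *bw)
{
	/* waits for the callback; the callback takes bw->lock, which is
	 * exactly why the caller must not hold that lock here */
	bw->callback_busy = false;
}

static void toy_start_timer(struct toy_bw *bw)	/* called with bw->lock held */
{
	while (bw->callback_busy) {
		pthread_mutex_unlock(&bw->lock);	/* avoid deadlock */
		toy_cancel_timer(bw);
		pthread_mutex_lock(&bw->lock);
		if (bw->timer_active)
			return;	/* raced: someone else restarted it */
	}
	bw->timer_active = true;
	/* arm the real timer here */
}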
+ */
+	unsigned long nr_running;
+	#define CPU_LOAD_IDX_MAX 5
+	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+	unsigned long last_load_update_tick;
+#ifdef CONFIG_NO_HZ
+	u64 nohz_stamp;
+	unsigned char nohz_balance_kick;
+#endif
+	int skip_clock_update;
+
+	/* capture load from *all* tasks on this cpu: */
+	struct load_weight load;
+	unsigned long nr_load_updates;
+	u64 nr_switches;
+
+	struct cfs_rq cfs;
+	struct rt_rq rt;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* list of leaf cfs_rq on this cpu: */
+	struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+	struct list_head leaf_rt_rq_list;
+#endif
+
+	/*
+	 * This is part of a global counter where only the total sum
+	 * over all CPUs matters. A task can increase this counter on
+	 * one CPU and if it got migrated afterwards it may decrease
+	 * it on another CPU. Always updated under the runqueue lock:
+	 */
+	unsigned long nr_uninterruptible;
+
+	struct task_struct *curr, *idle, *stop;
+	unsigned long next_balance;
+	struct mm_struct *prev_mm;
+
+	u64 clock;
+	u64 clock_task;
+
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	struct root_domain *rd;
+	struct sched_domain *sd;
+
+	unsigned long cpu_power;
+
+	unsigned char idle_balance;
+	/* For active balancing */
+	int post_schedule;
+	int active_balance;
+	int push_cpu;
+	struct cpu_stop_work active_balance_work;
+	/* cpu of this runqueue: */
+	int cpu;
+	int online;
+
+	u64 rt_avg;
+	u64 age_stamp;
+	u64 idle_stamp;
+	u64 avg_idle;
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif
+
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+	int hrtick_csd_pending;
+	struct call_single_data hrtick_csd;
+#endif
+	struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+	/* latency stats */
+	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+	/* sys_sched_yield() stats */
+	unsigned int yld_count;
+
+	/* schedule() stats */
+	unsigned int sched_switch;
+	unsigned int sched_count;
+	unsigned int sched_goidle;
+
+	/* try_to_wake_up() stats */
+	unsigned int ttwu_count;
+	unsigned int ttwu_local;
+#endif
+
+#ifdef CONFIG_SMP
+	struct llist_head wake_list;
+#endif
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return rq->cpu;
+#else
+	return 0;
+#endif
+}
+
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+			      lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
+#define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define raw_rq()		(&__raw_get_cpu_var(runqueues))
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this task belongs.
+ * + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ static void update_rq_clock_task(struct rq *rq, s64 delta); -void update_rq_clock(struct rq *rq) +static void update_rq_clock(struct rq *rq) { s64 delta; @@ -121,15 +802,45 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/** + * runqueue_is_locked - Returns true if the current cpu runqueue is locked + * @cpu: the processor in question. + * + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. 
+ */ +int runqueue_is_locked(int cpu) +{ + return raw_spin_is_locked(&cpu_rq(cpu)->lock); +} + /* * Debugging: various feature bits */ +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "sched_features.h" +}; + +#undef SCHED_FEAT + #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | const_debug unsigned int sysctl_sched_features = -#include "features.h" +#include "sched_features.h" 0; #undef SCHED_FEAT @@ -139,7 +850,7 @@ const_debug unsigned int sysctl_sched_features = #name , static __read_mostly char *sched_feat_names[] = { -#include "features.h" +#include "sched_features.h" NULL }; @@ -149,7 +860,7 @@ static int sched_feat_show(struct seq_file *m, void *v) { int i; - for (i = 0; i < __SCHED_FEAT_NR; i++) { + for (i = 0; sched_feat_names[i]; i++) { if (!(sysctl_sched_features & (1UL << i))) seq_puts(m, "NO_"); seq_printf(m, "%s ", sched_feat_names[i]); @@ -159,36 +870,6 @@ static int sched_feat_show(struct seq_file *m, void *v) return 0; } -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - static ssize_t sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -212,20 +893,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, cmp += 3; } - for (i = 0; i < __SCHED_FEAT_NR; i++) { + for (i = 0; sched_feat_names[i]; i++) { if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { + if (neg) sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { + else sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } break; } } - if (i == __SCHED_FEAT_NR) + if (!sched_feat_names[i]) return -EINVAL; *ppos += cnt; @@ -254,7 +932,10 @@ static __init int sched_init_debug(void) return 0; } late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) /* * Number of tasks to iterate in a single balance run. @@ -276,7 +957,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; */ unsigned int sysctl_sched_rt_period = 1000000; -__read_mostly int scheduler_running; +static __read_mostly int scheduler_running; /* * part of the period that we allow rt tasks to run in us. 
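The SCHED_FEAT dance above expands one feature list twice, first into enum tags and then into the default bit mask, by redefining the macro between two inclusions of sched_features.h. A self-contained variant of the same x-macro trick, using a list macro instead of re-including a header (all TOY_* names are hypothetical):

#define TOY_FEATURES(F)	\
	F(HRTICK, 0)	\
	F(LB_BIAS, 1)

/* first expansion: one enum constant per feature */
#define F_ENUM(name, enabled) __TOY_FEAT_##name,
enum { TOY_FEATURES(F_ENUM) };
#undef F_ENUM

/* second expansion: OR together the bits of enabled-by-default features */
#define F_MASK(name, enabled) (1UL << __TOY_FEAT_##name) * (enabled) |
static const unsigned long toy_features = TOY_FEATURES(F_MASK) 0;
#undef F_MASK

#define toy_feat(x) (toy_features & (1UL << __TOY_FEAT_##x))
/* here toy_feat(LB_BIAS) is nonzero and toy_feat(HRTICK) is 0 */

The trailing "0" after the second expansion terminates the dangling "|" left by the last list entry, just as in the sysctl_sched_features initializer above.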
@@ -284,7 +965,112 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +} +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + raw_spin_unlock_irq(&rq->lock); +#else + raw_spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * __task_rq_lock - lock the rq @p resides on. @@ -367,6 +1153,20 @@ static struct rq *this_rq_lock(void) * rq->lock. 
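finish_lock_switch() above pairs an smp_wmb() with the plain ->on_cpu store so that a task only becomes migratable after all prior updates to it are visible. In C11 atomics the same intent is a release store paired with an acquire load; an illustrative sketch only, not kernel code (toy_* names are hypothetical):

#include <stdatomic.h>
#include <stdbool.h>

struct toy_task {
	atomic_bool on_cpu;
	long state;		/* stands in for "all prior updates" */
};

/* the release store plays the role of smp_wmb() + plain store above:
 * nothing written before it may become visible after it */
static void toy_finish_switch(struct toy_task *prev)
{
	prev->state = 42;	/* must be visible before on_cpu reads false */
	atomic_store_explicit(&prev->on_cpu, false, memory_order_release);
}

/* pairs with the store: once this reads false, prev->state is visible */
static bool toy_task_running(struct toy_task *p)
{
	return atomic_load_explicit(&p->on_cpu, memory_order_acquire);
}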
*/ +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -410,7 +1210,7 @@ static void __hrtick_start(void *arg) * * called with rq->lock held and irqs disabled */ -void hrtick_start(struct rq *rq, u64 delay) +static void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); @@ -454,7 +1254,7 @@ static __init void init_hrtick(void) * * called with rq->lock held and irqs disabled */ -void hrtick_start(struct rq *rq, u64 delay) +static void hrtick_start(struct rq *rq, u64 delay) { __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); @@ -505,7 +1305,7 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -void resched_task(struct task_struct *p) +static void resched_task(struct task_struct *p) { int cpu; @@ -526,7 +1326,7 @@ void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } -void resched_cpu(int cpu) +static void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -599,60 +1399,234 @@ void wake_up_idle_cpu(int cpu) */ set_tsk_need_resched(rq->idle); - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) - smp_send_reschedule(cpu); + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} + +static inline bool got_nohz_idle_kick(void) +{ + return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ + return false; +} + +#endif /* CONFIG_NO_HZ */ + +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + +#else /* !CONFIG_SMP */ +static void resched_task(struct task_struct *p) +{ + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} + +static void sched_avg_update(struct rq *rq) +{ +} +#endif /* CONFIG_SMP */ + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. 
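The calc_delta_mine() body that follows avoids a division on every call by caching inv_weight = 2^32 / weight and multiplying instead. A standalone rendition of the fast path, assuming operands small enough that the 64-bit product cannot overflow (the real code splits the shift with SRR() to guard against exactly that):

#include <stdint.h>

static uint64_t toy_calc_delta(uint64_t delta_exec, uint32_t weight,
			       uint32_t lw_weight)
{
	/* cached once per weight change, as lw->inv_weight in the patch */
	uint64_t inv = (1ULL << 32) / lw_weight;

	/* delta_exec * weight / lw_weight, via one multiply and a shift */
	return (delta_exec * weight * inv) >> 32;
}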
+ */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; + + if (!lw->inv_weight) { + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; + } + + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline bool got_nohz_idle_kick(void) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { - int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + lw->weight += inc; + lw->inv_weight = 0; } -#else /* CONFIG_NO_HZ */ +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} -static inline bool got_nohz_idle_kick(void) +static inline void update_load_set(struct load_weight *lw, unsigned long w) { - return false; + lw->weight = w; + lw->inv_weight = 0; } -#endif /* CONFIG_NO_HZ */ +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 - while ((s64)(rq->clock - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
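The prio_to_weight[] values above follow directly from the ~1.25 multiplier per nice level, centred on 1024 at nice 0. A sketch that regenerates them (results differ slightly from the table, whose entries are precomputed and rounded by hand; link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	int nice;

	/* weight(nice) ~= 1024 / 1.25^nice, e.g. nice -20 -> ~88818
	 * versus the table's 88761 */
	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d -> weight %6.0f\n",
		       nice, 1024.0 / pow(1.25, nice));
	return 0;
}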
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetic by turning divisions
+ * into multiplications:
+ */
+static const u32 prio_to_wmult[40] = {
+ /* -20 */     48388,     59856,     76040,     92818,    118348,
+ /* -15 */    147320,    184698,    229616,    287308,    360437,
+ /* -10 */    449829,    563644,    704093,    875809,   1099582,
+ /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
+ /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
+ /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
+ /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
+ /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+	CPUACCT_STAT_USER,	/* ... user mode */
+	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
+
+	CPUACCT_STAT_NSTATS,
+};
+
+#ifdef CONFIG_CGROUP_CPUACCT
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val) {}
+#endif
+
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_add(&rq->load, load);
 }
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 {
-	assert_raw_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
+	update_load_sub(&rq->load, load);
 }
-#endif /* CONFIG_SMP */
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
 			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+typedef int (*tg_visitor)(struct task_group *, void *);
+
 /*
  * Iterate task_group tree rooted at *from, calling @down when first entering a
  * node and @up when leaving it for the final time.
  *
  * Caller must hold rcu_lock or sufficient equivalent.
  */
-int walk_tg_tree_from(struct task_group *from,
+static int walk_tg_tree_from(struct task_group *from,
 			tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
@@ -683,13 +1657,270 @@ int walk_tg_tree_from,
 	return ret;
 }
-int tg_nop(struct task_group *tg, void *data)
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+	return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long total = weighted_cpuload(cpu);
+
+	if (type == 0 || !sched_feat(LB_BIAS))
+		return total;
+
+	return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long total = weighted_cpuload(cpu);
+
+	if (type == 0 || !sched_feat(LB_BIAS))
+		return total;
+
+	return max(rq->cpu_load[type-1], total);
+}
+
+static unsigned long power_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_power;
+}
+
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+
+	if (nr_running)
+		return rq->load.weight / nr_running;
+
 	return 0;
 }
+
+#ifdef CONFIG_PREEMPT
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+/*
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	raw_spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			raw_spin_unlock(&this_rq->lock);
+			raw_spin_lock(&busiest->lock);
+			raw_spin_lock_nested(&this_rq->lock,
+					      SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			raw_spin_lock_nested(&busiest->lock,
+					      SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work well under rq->lock */
+		raw_spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	raw_spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +static void calc_load_account_idle(struct rq *this_rq); +static void update_sysctl(void); +static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; #endif +} + +static const struct sched_class rt_sched_class; + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +#include "sched_stats.h" + +static void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; +} -void update_cpu_load(struct rq *this_rq); +static void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} static void set_load_weight(struct task_struct *p) { @@ -726,7 +1957,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) /* * activate_task - move a task to the runqueue. */ -void activate_task(struct rq *rq, struct task_struct *p, int flags) +static void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; @@ -737,7 +1968,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) /* * deactivate_task - remove a task from the runqueue. 
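Both _double_lock_balance() and double_rq_lock() above rely on the same deadlock-avoidance rule: when two runqueue locks must be held, every path acquires the lower-addressed one first, so no two CPUs can each hold one lock while waiting for the other. The rule in isolation, as a pthread sketch (toy_lock_pair is hypothetical):

#include <pthread.h>

static void toy_lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);		/* same queue: one lock */
	} else if (a < b) {
		pthread_mutex_lock(a);		/* lower address first */
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);		/* lower address first */
		pthread_mutex_lock(a);
	}
}

Because addresses impose one global order on all locks, any set of threads locking pairs this way acquires them in a consistent order, which is exactly what rules out the hold-and-wait cycle a deadlock needs.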
*/ -void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -928,14 +2159,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { - u64 *cpustat = kcpustat_this_cpu->cpustat; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; unsigned long flags; u64 latest_ns; int ret = 0; local_irq_save(flags); latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) ret = 1; local_irq_restore(flags); return ret; @@ -943,14 +2174,14 @@ static int irqtime_account_hi_update(void) static int irqtime_account_si_update(void) { - u64 *cpustat = kcpustat_this_cpu->cpustat; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; unsigned long flags; u64 latest_ns; int ret = 0; local_irq_save(flags); latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) ret = 1; local_irq_restore(flags); return ret; @@ -962,6 +2193,15 @@ static int irqtime_account_si_update(void) #endif +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#include "sched_autogroup.c" +#include "sched_stoptask.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -1059,7 +2299,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { const struct sched_class *class; @@ -1085,6 +2325,38 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP +/* + * Is this task likely cache-hot: + */ +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -1511,11 +2783,6 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - -static inline int ttwu_share_cache(int this_cpu, int that_cpu) -{ - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu) @@ -1523,7 +2790,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ 
ttwu_queue_remote(p, cpu); return; @@ -1937,7 +3204,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); - trace_sched_stat_sleeptime(current, rq->clock); fire_sched_in_preempt_notifiers(current); if (mm) @@ -2173,7 +3439,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) */ static atomic_long_t calc_load_tasks_idle; -void calc_load_account_idle(struct rq *this_rq) +static void calc_load_account_idle(struct rq *this_rq) { long delta; @@ -2317,7 +3583,7 @@ static void calc_global_nohz(unsigned long ticks) */ } #else -void calc_load_account_idle(struct rq *this_rq) +static void calc_load_account_idle(struct rq *this_rq) { } @@ -2460,7 +3726,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -void update_cpu_load(struct rq *this_rq) +static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; unsigned long curr_jiffies = jiffies; @@ -2538,10 +3804,8 @@ void sched_exec(void) #endif DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); -EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* * Return any ns on the sched_clock that have not yet been accounted in @@ -2594,42 +3858,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif - /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. If the root cgroup - * is the only cgroup, then nothing else should be necessary. - * - */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif -} - - /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -2639,18 +3867,22 @@ static inline void task_group_account_field(struct task_struct *p, int index, void account_user_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled) { - int index; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - /* Add user time to cpustat. 
*/ - task_group_account_field(p, index, (__force u64) cputime); + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -2664,21 +3896,24 @@ void account_user_time(struct task_struct *p, cputime_t cputime, static void account_guest_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled) { - u64 *cpustat = kcpustat_this_cpu->cpustat; + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); - p->gtime += cputime; + p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); } } @@ -2691,15 +3926,18 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, */ static inline void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) + cputime_t cputime_scaled, cputime64_t *target_cputime64) { + cputime64_t tmp = cputime_to_cputime64(cputime); + /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. 
*/ - task_group_account_field(p, index, (__force u64) cputime); + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); /* Account for system time used */ acct_update_integrals(p); @@ -2715,7 +3953,8 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { - int index; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); @@ -2723,13 +3962,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; + target_cputime64 = &cpustat->softirq; else - index = CPUTIME_SYSTEM; + target_cputime64 = &cpustat->system; - __account_system_time(p, cputime, cputime_scaled, index); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* @@ -2738,9 +3977,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, */ void account_steal_time(cputime_t cputime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); - cpustat[CPUTIME_STEAL] += (__force u64) cputime; + cpustat->steal = cputime64_add(cpustat->steal, cputime64); } /* @@ -2749,13 +3989,14 @@ void account_steal_time(cputime_t cputime) */ void account_idle_time(cputime_t cputime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; + cpustat->idle = cputime64_add(cpustat->idle, cputime64); } static __always_inline bool steal_account_process_tick(void) @@ -2805,15 +4046,16 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) { cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; if (steal_account_process_tick()) return; if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. @@ -2821,7 +4063,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * Also, p->stime needs to be updated for ksoftirqd. 
*/ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); + &cpustat->softirq); } else if (user_tick) { account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); } else if (p == rq->idle) { @@ -2830,7 +4072,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); } else { __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); + &cpustat->system); } } @@ -2929,7 +4171,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: @@ -2937,11 +4179,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp = (__force u64) rtime; + u64 temp = rtime; - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; + temp *= utime; + do_div(temp, total); + utime = (cputime_t)temp; } else utime = rtime; @@ -2949,7 +4191,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) * Compare with previous values, to keep monotonicity: */ p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); *ut = p->prev_utime; *st = p->prev_stime; @@ -2966,20 +4208,21 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) thread_group_cputime(p, &cputime); - total = cputime.utime + cputime.stime; + total = cputime_add(cputime.utime, cputime.stime); rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp = (__force u64) rtime; + u64 temp = rtime; - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; + temp *= cputime.utime; + do_div(temp, total); + utime = (cputime_t)temp; } else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); *ut = sig->prev_utime; *st = sig->prev_stime; @@ -3078,9 +4321,6 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - if (oops_in_progress) - return; - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", prev->comm, prev->pid, preempt_count()); @@ -4612,13 +5852,6 @@ bool __sched yield_to(struct task_struct *p, bool preempt) */ if (preempt && rq != p_rq) resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. 
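task_times() above distributes the precise CFS runtime (rtime) across utime and stime in proportion to the coarser tick-sampled values, then clamps each result against the previous snapshot so the reported times never go backwards. The same arithmetic standalone (toy_task_times is hypothetical; assumes rtime >= *prev_utime, as the monotonic clamp maintains):

#include <stdint.h>

static void toy_task_times(uint64_t rtime, uint64_t utime, uint64_t stime,
			   uint64_t *prev_utime, uint64_t *prev_stime)
{
	uint64_t total = utime + stime;
	/* rescale the sampled utime so that utime + stime == rtime */
	uint64_t u = total ? rtime * utime / total : rtime;

	/* keep both outputs monotonic across calls */
	if (u > *prev_utime)
		*prev_utime = u;
	if (rtime - *prev_utime > *prev_stime)
		*prev_stime = rtime - *prev_utime;
}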
- */ - rq->skip_clock_update = 0; } out: @@ -4786,7 +6019,7 @@ void sched_show_task(struct task_struct *p) free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), + task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); @@ -4885,6 +6118,53 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #endif } +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static int get_update_sysctl_factor(void) +{ + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } + + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); +#undef SET_SYSCTL +} + +static inline void sched_init_granularity(void) +{ + update_sysctl(); +} + #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { @@ -5071,6 +6351,30 @@ static void calc_global_load_remove(struct rq *rq) rq->calc_load_active = 0; } +#ifdef CONFIG_CFS_BANDWIDTH +static void unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} +#else +static void unthrottle_offline_cfs_rqs(struct rq *rq) {} +#endif + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -5676,12 +6980,6 @@ static int init_rootdomain(struct root_domain *rd) return -ENOMEM; } -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -struct root_domain def_root_domain; - static void init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -5752,31 +7050,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } -/* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). - * - * Also keep a unique ID per domain (we use the first cpu number in - * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). 
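get_update_sysctl_factor() above caps the CPU count at 8 and, in the default SCHED_TUNABLESCALING_LOG mode, returns 1 + ilog2(cpus): one CPU gives factor 1 and eight or more give factor 4, so sched_latency and friends run at four times their single-CPU values on an 8-way box. The same computation without the kernel helpers (sketch; assumes cpus >= 1):

static unsigned int toy_factor(unsigned int cpus)
{
	unsigned int c = cpus < 8 ? cpus : 8;	/* min(cpus, 8) */
	unsigned int factor = 1;

	while (c >>= 1)				/* 1 + ilog2(c) */
		factor++;
	return factor;
}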
- */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); -DEFINE_PER_CPU(int, sd_llc_id); - -static void update_top_cache_domain(int cpu) -{ - struct sched_domain *sd; - int id = cpu; - - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) - id = cpumask_first(sched_domain_span(sd)); - - rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); - per_cpu(sd_llc_id, cpu) = id; -} - /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -5816,8 +7089,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); destroy_sched_domains(tmp, cpu); - - update_top_cache_domain(cpu); } /* cpus with isolated domains */ @@ -5977,7 +7248,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL, cpu_to_node(i)); if (!sg) goto fail; @@ -6115,12 +7386,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) return; update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; } /* @@ -6758,6 +8023,29 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, } } +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -6806,11 +8094,104 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -#ifdef CONFIG_CGROUP_SCHED -struct task_group root_task_group; +static void init_cfs_rq(struct cfs_rq *cfs_rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + init_cfs_rq_runtime(cfs_rq); + + tg->cfs_rq[cpu] = cfs_rq; + tg->se[cpu] = se; + + /* se could be NULL for root_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + + se->my_q = 
cfs_rq; + update_load_set(&se->load, 0); + se->parent = parent; +} #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +#ifdef CONFIG_RT_GROUP_SCHED +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; + rt_rq->tg = tg; + + tg->rt_rq[cpu] = rt_rq; + tg->rt_se[cpu] = rt_se; + + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif void __init sched_init(void) { @@ -6868,17 +8249,9 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); - INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -6890,7 +8263,7 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; + root_task_group.shares = root_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); /* * How much cpu bandwidth does root_task_group get? @@ -6940,7 +8313,7 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ - rq->nohz_flags = 0; + rq->nohz_balance_kick = 0; #endif #endif init_rq_hrtick(rq); @@ -6953,6 +8326,10 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters); #endif @@ -6980,11 +8357,17 @@ void __init sched_init(void) #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); +#ifdef CONFIG_NO_HZ + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); +#endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); -#endif - init_sched_fair_class(); +#endif /* SMP */ scheduler_running = 1; } @@ -7136,10 +8519,169 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_CGROUP_SCHED -/* task_group_lock serializes the addition/removal of task groups */ -static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) +{ + int i; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) 
+ goto err; + + tg->shares = NICE_0_LOAD; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err_free_rq; + + init_cfs_rq(cfs_rq); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + } + + return 1; + +err_free_rq: + kfree(cfs_rq); +err: + return 0; +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +#else /* !CONFIG_FAIR_GROUP_SCHED */ +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err_free_rq; + + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); + } + + return 1; + +err_free_rq: + kfree(rt_rq); +err: + return 0; +} +#else /* !CONFIG_RT_GROUP_SCHED */ +static inline void free_rt_sched_group(struct task_group *tg) +{ +} +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_CGROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7244,6 +8786,50 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. 
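For the bandwidth knobs handled below, cfs_quota_us / cfs_period_us is simply how many CPUs' worth of runtime the group may consume: 50000us of quota per 100000us period is half a CPU, 200000us per 100000us is two full CPUs, and a quota of -1 (RUNTIME_INF) removes the limit. As arithmetic (toy_cpu_share is a hypothetical helper):

#include <stdint.h>

static double toy_cpu_share(int64_t quota_us, int64_t period_us)
{
	if (quota_us < 0)
		return -1.0;	/* RUNTIME_INF: no limit */
	return (double)quota_us / (double)period_us;
}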
+ */ + if (!tg->se[0]) + return -EINVAL; + + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + tg->shares = shares; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +done: + mutex_unlock(&shares_mutex); + return 0; +} + +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} +#endif + #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { @@ -7266,7 +8852,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) struct task_struct *g, *p; do_each_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) return 1; } while_each_thread(g, p); @@ -7617,8 +9203,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0, runtime_enabled, runtime_was_enabled; - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + int i, ret = 0, runtime_enabled; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) return -EINVAL; @@ -7645,8 +9231,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) goto out_unlock; runtime_enabled = quota != RUNTIME_INF; - runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -7662,13 +9246,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_possible_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; - struct rq *rq = cfs_rq->rq; + struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; - if (cfs_rq->throttled) + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } @@ -7682,7 +9266,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; - period = ktime_to_ns(tg->cfs_bandwidth.period); + period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); if (cfs_quota_us < 0) quota = RUNTIME_INF; else @@ -7695,10 +9279,10 @@ long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; - if (tg->cfs_bandwidth.quota == RUNTIME_INF) + if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) return -1; - quota_us = tg->cfs_bandwidth.quota; + quota_us = tg_cfs_bandwidth(tg)->quota; do_div(quota_us, NSEC_PER_USEC); return quota_us; @@ -7709,7 +9293,10 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) u64 quota, period; period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; + quota = tg_cfs_bandwidth(tg)->quota; + + if (period <= 0) + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } @@ -7718,7 +9305,7 @@ long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); do_div(cfs_period_us, NSEC_PER_USEC); return cfs_period_us; @@ -7778,13 +9365,13 @@ static u64 normalize_cfs_quota(struct task_group *tg, static 
int tg_cfs_schedulable_down(struct task_group *tg, void *data) { struct cfs_schedulable_data *d = data; - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); s64 quota = 0, parent_quota = -1; if (!tg->parent) { quota = RUNTIME_INF; } else { - struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; + struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); quota = normalize_cfs_quota(tg, d); parent_quota = parent_b->hierarchal_quota; @@ -7828,7 +9415,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, struct cgroup_map_cb *cb) { struct task_group *tg = cgroup_tg(cgrp); - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); cb->fill(cb, "nr_periods", cfs_b->nr_periods); cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); @@ -7929,16 +9516,38 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; + struct cpuacct *parent; +}; + +struct cgroup_subsys cpuacct_subsys; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; - ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) goto out; @@ -7946,13 +9555,18 @@ static struct cgroup_subsys_state *cpuacct_create( if (!ca->cpuusage) goto out_free_ca; - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; + + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); return &ca->css; -out_free_cpuusage: +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); out_free_ca: kfree(ca); @@ -7965,8 +9579,10 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); + int i; - free_percpu(ca->cpustat); + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); kfree(ca); } @@ -8059,31 +9675,16 @@ static const char *cpuacct_stat_desc[] = { }; static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) + struct cgroup_map_cb *cb) { struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + int i; - val = 0; - for_each_online_cpu(cpu) { - struct 
kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[i], val); } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - return 0; } @@ -8113,7 +9714,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) * * called with rq->lock held. */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) +static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int cpu; @@ -8127,7 +9728,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); - for (; ca; ca = parent_ca(ca)) { + for (; ca; ca = ca->parent) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } @@ -8135,6 +9736,45 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +/* + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large + * in cputime_t units. As a result, cpuacct_update_stats calls + * percpu_counter_add with values large enough to always overflow the + * per cpu batch limit causing bad SMP scalability. + * + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled + * and enabled. We cap it at INT_MAX which is the largest allowed batch value. + */ +#ifdef CONFIG_SMP +#define CPUACCT_BATCH \ + min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) +#else +#define CPUACCT_BATCH 0 +#endif + +/* + * Charge the system/user time to the task's accounting group. + */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + int batch = CPUACCT_BATCH; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + __percpu_counter_add(&ca->cpustat[idx], val, batch); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); +} + struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, diff --git a/trunk/kernel/sched/Makefile b/trunk/kernel/sched/Makefile deleted file mode 100644 index 9a7dd35102a3..000000000000 --- a/trunk/kernel/sched/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = -pg -endif - -ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) -# According to Alan Modra , the -fno-omit-frame-pointer is -# needed for x86 only. Why this used to be enabled for all architectures is beyond -# me. I suspect most platforms don't need this, but until we know that for sure -# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k -# to get a correct value for the wait-channel (WCHAN in ps). 
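
Stepping back to the CPUACCT_BATCH macro above, some invented but plausible numbers, assuming percpu_counter_batch is 32: with jiffies-granularity cputime, cputime_one_jiffy is 1 and the batch stays 32; with CONFIG_VIRT_CPU_ACCOUNTING and nanosecond-granularity cputime at HZ=100, cputime_one_jiffy is on the order of 10,000,000, so

    batch = min_t(long, 32 * 10000000L, INT_MAX) = 320000000

which keeps the amount of *time* batched per cpu roughly the same in both configurations, exactly as the comment argues.
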
--davidm -CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer -endif - -obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o -obj-$(CONFIG_SMP) += cpupri.o -obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o -obj-$(CONFIG_SCHEDSTATS) += stats.o -obj-$(CONFIG_SCHED_DEBUG) += debug.o - - diff --git a/trunk/kernel/sched/sched.h b/trunk/kernel/sched/sched.h deleted file mode 100644 index 98c0c2623db8..000000000000 --- a/trunk/kernel/sched/sched.h +++ /dev/null @@ -1,1166 +0,0 @@ - -#include -#include -#include -#include - -#include "cpupri.h" - -extern __read_mostly int scheduler_running; - -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - -/* - * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define DEF_TIMESLICE (100 * HZ / 1000) - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) - -static inline int rt_policy(int policy) -{ - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; -} - -static inline int task_has_rt_policy(struct task_struct *p) -{ - return rt_policy(p->policy); -} - -/* - * This is the priority-queue data structure of the RT scheduling class: - */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct rt_bandwidth { - /* nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - ktime_t rt_period; - u64 rt_runtime; - struct hrtimer rt_period_timer; -}; - -extern struct mutex sched_domains_mutex; - -#ifdef CONFIG_CGROUP_SCHED - -#include - -struct cfs_rq; -struct rt_rq; - -static LIST_HEAD(task_groups); - -struct cfs_bandwidth { -#ifdef CONFIG_CFS_BANDWIDTH - raw_spinlock_t lock; - ktime_t period; - u64 quota, runtime; - s64 hierarchal_quota; - u64 runtime_expires; - - int idle, timer_active; - struct hrtimer period_timer, slack_timer; - struct list_head throttled_cfs_rq; - - /* statistics */ - int nr_periods, nr_throttled; - u64 throttled_time; -#endif -}; - -/* task group related information */ -struct task_group { - struct cgroup_subsys_state css; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; - - atomic_t load_weight; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - struct rt_bandwidth rt_bandwidth; -#endif - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; - -#ifdef CONFIG_SCHED_AUTOGROUP 
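
For the deleted priority macros above, a few worked values (MAX_RT_PRIO is 100 and MAX_PRIO is 140 in this tree):

    NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100
    NICE_TO_PRIO(0)   == 100 +    0  + 20 == 120   /* default prio */
    NICE_TO_PRIO(19)  == 100 +   19  + 20 == 139
    USER_PRIO(120)    == 120 - 100        ==  20   /* in [0 ... 39] */
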
- struct autogroup *autogroup; -#endif - - struct cfs_bandwidth cfs_bandwidth; -}; - -#ifdef CONFIG_FAIR_GROUP_SCHED -#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD - -/* - * A weight of 0 or 1 can cause arithmetics problems. - * A weight of a cfs_rq is the sum of weights of which entities - * are queued on this cfs_rq, so a weight of a entity should not be - * too large, so as the shares value of a task group. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) -#endif - -/* Default task group. - * Every task in system belong to this group at bootup. - */ -extern struct task_group root_task_group; - -typedef int (*tg_visitor)(struct task_group *, void *); - -extern int walk_tg_tree_from(struct task_group *from, - tg_visitor down, tg_visitor up, void *data); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - * - * Caller must hold rcu_lock or sufficient equivalent. - */ -static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) -{ - return walk_tg_tree_from(&root_task_group, down, up, data); -} - -extern int tg_nop(struct task_group *tg, void *data); - -extern void free_fair_sched_group(struct task_group *tg); -extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); -extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, - struct sched_entity *parent); -extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); - -extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); - -extern void free_rt_sched_group(struct task_group *tg); -extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); -extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, - struct sched_rt_entity *parent); - -#else /* CONFIG_CGROUP_SCHED */ - -struct cfs_bandwidth { }; - -#endif /* CONFIG_CGROUP_SCHED */ - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running, h_nr_running; - - u64 exec_clock; - u64 min_vruntime; -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - - struct list_head tasks; - struct list_head *balance_iterator; - - /* - * 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr, *next, *last, *skip; - -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. 
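
The leaf list described just above is what the per-cpu load-balance code walks; a minimal sketch of a walker (the loop body is illustrative only):

    struct cfs_rq *cfs_rq;

    for_each_leaf_cfs_rq(rq, cfs_rq) {
        /* visits every cfs_rq on this cpu that holds (or recently
         * held) tasks; linked via leaf_cfs_rq_list / on_list above */
    }
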
- */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long h_load; - - /* - * Maintaining per-cpu shares distribution for group scheduling - * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time - */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; - - unsigned long load_contribution; -#endif /* CONFIG_SMP */ -#ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; - u64 runtime_expires; - s64 runtime_remaining; - - u64 throttled_timestamp; - int throttled, throttle_count; - struct list_head throttled_list; -#endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ -}; - -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - struct { - int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP - int next; /* next highest */ -#endif - } highest_prio; -#endif -#ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; -#endif - int rt_throttled; - u64 rt_time; - u64 rt_runtime; - /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; - - struct rq *rq; - struct list_head leaf_rt_rq_list; - struct task_group *tg; -#endif -}; - -#ifdef CONFIG_SMP - -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -extern struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct rq { - /* runqueue lock: */ - raw_spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. 
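
To make the h_load comment above concrete (figures invented): if the recursive weight fraction assigned to a group works out to f(tg) = 0.25 and its cfs_rq carries load.weight == 2048, then

    h_load = 2048 * 0.25 = 512

i.e. that queue contributes 512 units of hierarchy-weighted load when the balancer compares cpus.
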
- */ - unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ - u64 nohz_stamp; - unsigned long nohz_flags; -#endif - int skip_clock_update; - - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; - struct rt_rq rt; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; - - u64 clock; - u64 clock_task; - - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; - - unsigned long cpu_power; - - unsigned char idle_balance; - /* For active balancing */ - int post_schedule; - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; - - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; -#endif - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif - - /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; - -#ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP - int hrtick_csd_pending; - struct call_single_data hrtick_csd; -#endif - struct hrtimer hrtick_timer; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif - -#ifdef CONFIG_SMP - struct llist_head wake_list; -#endif -}; - -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - -DECLARE_PER_CPU(struct rq, runqueues); - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) - -#ifdef CONFIG_SMP - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ - __sd; __sd = __sd->parent) - -#define for_each_lower_domain(sd) for (; sd; sd = sd->child) - -/** - * highest_flag_domain - Return highest sched_domain containing flag. 
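
A minimal usage sketch for the domain iterators above (the flag test is illustrative):

    struct sched_domain *sd;

    rcu_read_lock();                    /* or any preempt-disabled section */
    for_each_domain(cpu, sd) {
        if (sd->flags & SD_SHARE_PKG_RESOURCES)
            break;                      /* climbs from the cpu's base
                                         * domain up toward the root */
    }
    rcu_read_unlock();
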
- * @cpu: The cpu whose highest level of sched domain is to - * be returned. - * @flag: The flag to check for the highest sched_domain - * for the given cpu. - * - * Returns the highest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *highest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd, *hsd = NULL; - - for_each_domain(cpu, sd) { - if (!(sd->flags & flag)) - break; - hsd = sd; - } - - return hsd; -} - -DECLARE_PER_CPU(struct sched_domain *, sd_llc); -DECLARE_PER_CPU(int, sd_llc_id); - -#endif /* CONFIG_SMP */ - -#include "stats.h" -#include "auto_group.h" - -#ifdef CONFIG_CGROUP_SCHED - -/* - * Return the group to which this tasks belongs. - * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. - */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) - struct task_group *tg = task_group(p); -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = tg->cfs_rq[cpu]; - p->se.parent = tg->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = tg->rt_rq[cpu]; - p->rt.parent = tg->rt_se[cpu]; -#endif -} - -#else /* CONFIG_CGROUP_SCHED */ - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_CGROUP_SCHED */ - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - -/* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# include -# define const_debug __read_mostly -#else -# define const_debug const -#endif - -extern const_debug unsigned int sysctl_sched_features; - -#define SCHED_FEAT(name, enabled) \ - __SCHED_FEAT_##name , - -enum { -#include "features.h" - __SCHED_FEAT_NR, -}; - -#undef SCHED_FEAT - -#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) -{ - return likely(static_branch(key)); /* Not out of line branch. */ -} - -static __always_inline bool static_branch__false(struct jump_label_key *key) -{ - return unlikely(static_branch(key)); /* Out of line branch. 
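
The SCHED_FEAT wrapper that follows turns each features.h entry into one of these helpers; for a hypothetical SCHED_FEAT(HRTICK, false) entry the expansion is roughly:

    static __always_inline bool static_branch_HRTICK(struct jump_label_key *key)
    {
        return static_branch__false(key);   /* default-off: out of line */
    }

    /* ...and sched_feat(HRTICK) becomes
     * static_branch_HRTICK(&sched_feat_keys[__SCHED_FEAT_HRTICK]) */
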
*/ -} - -#define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ -{ \ - return static_branch__##enabled(key); \ -} - -#include "features.h" - -#undef SCHED_FEAT - -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; -#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ - -static inline u64 global_rt_period(void) -{ - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; -} - -static inline u64 global_rt_runtime(void) -{ - if (sysctl_sched_rt_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -} - - - -static inline int task_current(struct rq *rq, struct task_struct *p) -{ - return rq->curr == p; -} - -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->on_cpu; -#else - return task_current(rq, p); -#endif -} - - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else - raw_spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. 
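
The smp_wmb() above pairs with a read-side barrier in the waker; roughly what try_to_wake_up() does in this tree (sketch):

    /* wait until the previous cpu is done switching the task out */
    while (p->on_cpu)
        cpu_relax();
    smp_rmb();      /* pairs with the smp_wmb() before prev->on_cpu = 0 */
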
- */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. - * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; - -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - - -#define sched_class_highest (&stop_sched_class) -#define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) - -extern const struct sched_class stop_sched_class; -extern const struct sched_class rt_sched_class; -extern const struct sched_class fair_sched_class; -extern const struct sched_class idle_sched_class; - - -#ifdef CONFIG_SMP - -extern void trigger_load_balance(struct rq *rq, int cpu); -extern void idle_balance(int this_cpu, struct rq *this_rq); - -#else /* CONFIG_SMP */ - -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - -#endif - -extern void sysrq_sched_debug_show(void); -extern void sched_init_granularity(void); -extern void update_max_interval(void); -extern void update_group_power(struct sched_domain *sd, int cpu); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); -extern void init_sched_rt_class(void); -extern void init_sched_fair_class(void); - -extern void resched_task(struct task_struct *p); -extern void resched_cpu(int cpu); - -extern struct rt_bandwidth def_rt_bandwidth; -extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); - -extern void update_cpu_load(struct rq *this_rq); - -#ifdef CONFIG_CGROUP_CPUACCT -#include -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - -static inline void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; -} - -static inline void dec_nr_running(struct rq *rq) -{ - rq->nr_running--; -} - -extern void update_rq_clock(struct rq *rq); - -extern void activate_task(struct rq *rq, struct task_struct *p, int flags); -extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - -extern const_debug unsigned int sysctl_sched_time_avg; -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; - -static inline u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - -void calc_load_account_idle(struct rq *this_rq); - -#ifdef CONFIG_SCHED_HRTICK - -/* - * Use hrtick when: - * - enabled by features - * - hrtimer is actually high res - */ -static inline int hrtick_enabled(struct rq *rq) -{ - if (!sched_feat(HRTICK)) - return 0; - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); -} - -void hrtick_start(struct rq *rq, u64 delay); - -#else - -static inline int hrtick_enabled(struct rq 
*rq) -{ - return 0; -} - -#endif /* CONFIG_SCHED_HRTICK */ - -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta; - sched_avg_update(rq); -} -#else -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } -static inline void sched_avg_update(struct rq *rq) { } -#endif - -extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); - -#ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT - -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * fair double_lock_balance: Safely acquires both rq->locks in a fair - * way at the expense of forcing extra atomic operations in all - * invocations. This assures that the double_lock is acquired using the - * same underlying policy as the spinlock_t on this architecture, which - * reduces latency compared to the unfair variant below. However, it - * also adds more overhead and therefore may reduce throughput. - */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - raw_spin_unlock(&this_rq->lock); - double_rq_lock(this_rq, busiest); - - return 1; -} - -#else -/* - * Unfair double_lock_balance: Optimizes throughput at the expense of - * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, - * regardless of entry order into the function. - */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); - } - return ret; -} - -#endif /* CONFIG_PREEMPT */ - -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) -{ - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } - - return _double_lock_balance(this_rq, busiest); -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. 
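
Usage sketch for the helpers above, with the caller supplying the interrupt state as the comments demand:

    local_irq_disable();
    double_rq_lock(rq1, rq2);       /* lower-addressed rq is locked first */
    /* ... move tasks between the two runqueues ... */
    double_rq_unlock(rq1, rq2);
    local_irq_enable();
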
- */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -#else /* CONFIG_SMP */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); - __release(rq2->lock); -} - -#endif - -extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); -extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -extern void print_cfs_stats(struct seq_file *m, int cpu); -extern void print_rt_stats(struct seq_file *m, int cpu); - -extern void init_cfs_rq(struct cfs_rq *cfs_rq); -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq); - -extern void account_cfs_bandwidth_used(int enabled, int was_enabled); - -#ifdef CONFIG_NO_HZ -enum rq_nohz_flag_bits { - NOHZ_TICK_STOPPED, - NOHZ_BALANCE_KICK, - NOHZ_IDLE, -}; - -#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -#endif diff --git a/trunk/kernel/sched/stats.c b/trunk/kernel/sched/stats.c deleted file mode 100644 index 2a581ba8e190..000000000000 --- a/trunk/kernel/sched/stats.c +++ /dev/null @@ -1,111 +0,0 @@ - -#include -#include -#include -#include - -#include "sched.h" - -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 15 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcount = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", - cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, - rq->ttwu_count, rq->ttwu_local, - rq->rq_cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { - enum cpu_idle_type itype; - - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", - sd->lb_count[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, - " %u %u %u %u %u %u %u %u %u %u %u %u\n", 
- sd->alb_count, sd->alb_failed, sd->alb_pushed, - sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - rcu_read_unlock(); -#endif - } - kfree(mask_str); - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_schedstat_init(void) -{ - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - return 0; -} -module_init(proc_schedstat_init); diff --git a/trunk/kernel/sched/auto_group.c b/trunk/kernel/sched_autogroup.c similarity index 88% rename from trunk/kernel/sched/auto_group.c rename to trunk/kernel/sched_autogroup.c index e8a1f83ee0e7..429242f3c484 100644 --- a/trunk/kernel/sched/auto_group.c +++ b/trunk/kernel/sched_autogroup.c @@ -1,19 +1,15 @@ #ifdef CONFIG_SCHED_AUTOGROUP -#include "sched.h" - #include #include #include #include -#include -#include unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; -void __init autogroup_init(struct task_struct *init_task) +static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; kref_init(&autogroup_default.kref); @@ -21,7 +17,7 @@ void __init autogroup_init(struct task_struct *init_task) init_task->signal->autogroup = &autogroup_default; } -void autogroup_free(struct task_group *tg) +static inline void autogroup_free(struct task_group *tg) { kfree(tg->autogroup); } @@ -63,6 +59,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -108,7 +108,8 @@ static inline struct autogroup *autogroup_create(void) return autogroup_kref_get(&autogroup_default); } -bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) +static inline bool +task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; @@ -126,6 +127,22 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -246,7 +263,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SCHED_DEBUG -int autogroup_path(struct task_group *tg, char *buf, int buflen) +static inline int autogroup_path(struct task_group *tg, char *buf, 
int buflen) { if (!task_group_is_autogroup(tg)) return 0; diff --git a/trunk/kernel/sched/auto_group.h b/trunk/kernel/sched_autogroup.h similarity index 66% rename from trunk/kernel/sched/auto_group.h rename to trunk/kernel/sched_autogroup.h index 8bd047142816..c2f0e7248dca 100644 --- a/trunk/kernel/sched/auto_group.h +++ b/trunk/kernel/sched_autogroup.h @@ -1,8 +1,5 @@ #ifdef CONFIG_SCHED_AUTOGROUP -#include -#include - struct autogroup { /* * reference doesn't mean how many thread attach to this @@ -16,28 +13,9 @@ struct autogroup { int nice; }; -extern void autogroup_init(struct task_struct *init_task); -extern void autogroup_free(struct task_group *tg); - -static inline bool task_group_is_autogroup(struct task_group *tg) -{ - return !!tg->autogroup; -} - -extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); - +static inline bool task_group_is_autogroup(struct task_group *tg); static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg) -{ - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) - return p->signal->autogroup->tg; - - return tg; -} - -extern int autogroup_path(struct task_group *tg, char *buf, int buflen); +autogroup_task_group(struct task_struct *p, struct task_group *tg); #else /* !CONFIG_SCHED_AUTOGROUP */ diff --git a/trunk/kernel/sched/clock.c b/trunk/kernel/sched_clock.c similarity index 100% rename from trunk/kernel/sched/clock.c rename to trunk/kernel/sched_clock.c diff --git a/trunk/kernel/sched/cpupri.c b/trunk/kernel/sched_cpupri.c similarity index 99% rename from trunk/kernel/sched/cpupri.c rename to trunk/kernel/sched_cpupri.c index b0d798eaf130..a86cf9d9eb11 100644 --- a/trunk/kernel/sched/cpupri.c +++ b/trunk/kernel/sched_cpupri.c @@ -1,5 +1,5 @@ /* - * kernel/sched/cpupri.c + * kernel/sched_cpupri.c * * CPU priority management * @@ -28,7 +28,7 @@ */ #include -#include "cpupri.h" +#include "sched_cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) diff --git a/trunk/kernel/sched/cpupri.h b/trunk/kernel/sched_cpupri.h similarity index 100% rename from trunk/kernel/sched/cpupri.h rename to trunk/kernel/sched_cpupri.h diff --git a/trunk/kernel/sched/debug.c b/trunk/kernel/sched_debug.c similarity index 99% rename from trunk/kernel/sched/debug.c rename to trunk/kernel/sched_debug.c index 2a075e10004b..a6710a112b4f 100644 --- a/trunk/kernel/sched/debug.c +++ b/trunk/kernel/sched_debug.c @@ -1,5 +1,5 @@ /* - * kernel/sched/debug.c + * kernel/time/sched_debug.c * * Print the CFS rbtree * @@ -16,8 +16,6 @@ #include #include -#include "sched.h" - static DEFINE_SPINLOCK(sched_debug_lock); /* @@ -375,7 +373,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } -void sysrq_sched_debug_show(void) +static void sysrq_sched_debug_show(void) { sched_debug_show(NULL, NULL); } diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched_fair.c similarity index 86% rename from trunk/kernel/sched/fair.c rename to trunk/kernel/sched_fair.c index 84adb2d66cbd..8a39fa3e3c6c 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched_fair.c @@ -23,13 +23,6 @@ #include #include #include -#include -#include -#include - -#include - -#include "sched.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -110,110 +103,7 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -/* - * Increase the 
granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static int get_update_sysctl_factor(void) -{ - unsigned int cpus = min_t(int, num_online_cpus(), 8); - unsigned int factor; - - switch (sysctl_sched_tunable_scaling) { - case SCHED_TUNABLESCALING_NONE: - factor = 1; - break; - case SCHED_TUNABLESCALING_LINEAR: - factor = cpus; - break; - case SCHED_TUNABLESCALING_LOG: - default: - factor = 1 + ilog2(cpus); - break; - } - - return factor; -} - -static void update_sysctl(void) -{ - unsigned int factor = get_update_sysctl_factor(); - -#define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); -#undef SET_SYSCTL -} - -void sched_init_granularity(void) -{ - update_sysctl(); -} - -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; - - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; - } - - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - - -const struct sched_class fair_sched_class; +static const struct sched_class fair_sched_class; /************************************************************** * CFS operations on generic schedulable entities: @@ -523,7 +413,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -544,7 +434,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) } #ifdef CONFIG_SCHED_DEBUG -struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -794,7 +684,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); + inc_cpu_load(rq_of(cfs_rq), se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); 
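
Worked numbers for calc_delta_mine() above (figures invented, SCHED_LOAD_RESOLUTION scaling ignored): charging delta_exec = 4,000,000ns at weight 1024 against a queue load lw->weight = 3072 gives

    inv_weight = 2^32 / 3072                         =  1,398,101
    result     = (4000000 * 1024 * inv_weight) >> 32 ~= 1,333,333ns

i.e. delta * weight / lw evaluated with a multiply and a shift instead of a division, which is the whole point of caching inv_weight.
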
list_add(&se->group_node, &cfs_rq->tasks); @@ -807,7 +697,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); + dec_cpu_load(rq_of(cfs_rq), se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); list_del_init(&se->group_node); @@ -1003,6 +893,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; + se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1019,6 +910,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; + se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1028,8 +920,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) trace_sched_stat_iowait(tsk, delta); } - trace_sched_stat_blocked(tsk, delta); - /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the @@ -1397,32 +1287,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ #ifdef CONFIG_CFS_BANDWIDTH - -#ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; - -static inline bool cfs_bandwidth_used(void) -{ - return static_branch(&__cfs_bandwidth_used); -} - -void account_cfs_bandwidth_used(int enabled, int was_enabled) -{ - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); -} -#else /* HAVE_JUMP_LABEL */ -static bool cfs_bandwidth_used(void) -{ - return true; -} - -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} -#endif /* HAVE_JUMP_LABEL */ - /* * default period for cfs group bandwidth. 
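
Concrete settings for the bandwidth machinery above (values invented): with cpu.cfs_period_us = 100000 (the 0.1s default) and cpu.cfs_quota_us = 25000,

    quota / period = 25000 / 100000 = 0.25  ->  25% of one cpu per period

so the group may consume 25ms of cpu time per 100ms period however that is spread across cpus, and each cfs_rq draws runtime from the global pool in sysctl_sched_cfs_bandwidth_slice (5ms) slices.
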
* default: 0.1s, units: nanoseconds @@ -1444,7 +1308,7 @@ static inline u64 sched_cfs_bandwidth_slice(void) * * requires cfs_b->lock */ -void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { u64 now; @@ -1456,11 +1320,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); } -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return &tg->cfs_bandwidth; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1562,7 +1421,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq->runtime_enabled) return; __account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1570,13 +1429,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_rq->throttled; } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttle_count; + return cfs_rq->throttle_count; } /* @@ -1671,7 +1530,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_b->lock); } -void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); @@ -1897,9 +1756,6 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; @@ -1945,9 +1801,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -1965,9 +1818,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* conditionally throttle active cfs_rq's from put_prev_entity() */ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) return; @@ -1980,112 +1830,7 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } - -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - -static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, slack_timer); - do_sched_cfs_slack_timer(cfs_b); - - return HRTIMER_NORESTART; -} - -static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - - if (!overrun) - break; - - idle = 
do_sched_cfs_period_timer(cfs_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - raw_spin_lock_init(&cfs_b->lock); - cfs_b->runtime = 0; - cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); - - INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->period_timer.function = sched_cfs_period_timer; - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; -} - -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - cfs_rq->runtime_enabled = 0; - INIT_LIST_HEAD(&cfs_rq->throttled_list); -} - -/* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { - raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) - return; - } - - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); -} - -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - hrtimer_cancel(&cfs_b->period_timer); - hrtimer_cancel(&cfs_b->slack_timer); -} - -void unthrottle_offline_cfs_rqs(struct rq *rq) -{ - struct cfs_rq *cfs_rq; - - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - if (!cfs_rq->runtime_enabled) - continue; - - /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = cfs_b->quota; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); - } -} - -#else /* CONFIG_CFS_BANDWIDTH */ +#else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2107,22 +1852,8 @@ static inline int throttled_lb_pair(struct task_group *tg, { return 0; } - -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return NULL; -} -static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} - -#endif /* CONFIG_CFS_BANDWIDTH */ - /************************************************** * CFS operations on tasks: */ @@ -2135,7 +1866,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -2166,7 +1897,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ 
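The period timer above follows the canonical hrtimer restart idiom: on expiry, advance the expiration point by whole periods until it lies in the future, handing the handler the number of periods that elapsed. A userspace analogue of hrtimer_forward() (looping where the kernel version computes the step arithmetically):

    #include <stdio.h>
    #include <stdint.h>

    static unsigned int forward(uint64_t *expires, uint64_t now, uint64_t period)
    {
        unsigned int overrun = 0;

        while (*expires <= now) {   /* expiry already in the past? */
            *expires += period;     /* advance one whole period */
            overrun++;
        }
        return overrun;
    }

    int main(void)
    {
        uint64_t expires = 100, period = 100;

        /* handler ran late: three periods have elapsed by now=350 */
        unsigned int o = forward(&expires, 350, period);
        printf("overrun=%u next_expiry=%llu\n",
               o, (unsigned long long)expires);  /* overrun=3 next=400 */
        return 0;
    }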
-2289,61 +2020,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -static unsigned long power_of(int cpu) -{ - return cpu_rq(cpu)->cpu_power; -} - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - - if (nr_running) - return rq->load.weight / nr_running; - - return 0; -} - static void task_waking_fair(struct task_struct *p) { @@ -2651,7 +2327,7 @@ static int select_idle_sibling(struct task_struct *p, int target) int prev_cpu = task_cpu(p); struct sched_domain *sd; struct sched_group *sg; - int i; + int i, smt = 0; /* * If the task is going to be woken-up on this cpu and if it is @@ -2671,9 +2347,17 @@ static int select_idle_sibling(struct task_struct *p, int target) * Otherwise, iterate the domains and find an eligible idle cpu. */ rcu_read_lock(); +again: + for_each_domain(target, sd) { + if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) + continue; + + if (smt && !(sd->flags & SD_SHARE_CPUPOWER)) + break; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) + break; - sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { sg = sd->groups; do { if (!cpumask_intersects(sched_group_cpus(sg), @@ -2692,6 +2376,10 @@ static int select_idle_sibling(struct task_struct *p, int target) sg = sg->next; } while (sg != sd->groups); } + if (!smt) { + smt = 1; + goto again; + } done: rcu_read_unlock(); @@ -2720,9 +2408,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) - return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) want_affine = 1; @@ -3007,8 +2692,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) } while (cfs_rq); p = task_of(se); - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); + hrtick_start_fair(rq, p); return p; } @@ -3052,12 +2736,6 @@ static void yield_task_fair(struct rq *rq) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - /* - * Tell update_rq_clock() that we've just updated, - * so we don't do microscopic update in schedule() - * and double the fastpath cost.
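The source_load()/target_load() pair removed above deliberately skews estimates: a migration source is judged by the smaller of its instantaneous and decayed load, a target by the larger, so balancing errs on the conservative side. A toy illustration with invented load figures:

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
    static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

    int main(void)
    {
        unsigned long instant = 2048;   /* rq->load.weight right now */
        unsigned long decayed = 1536;   /* rq->cpu_load[type-1], a decayed average */

        printf("source_load: %lu\n", min_ul(decayed, instant)); /* 1536: under-estimate */
        printf("target_load: %lu\n", max_ul(decayed, instant)); /* 2048: over-estimate */
        return 0;
    }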
- */ - rq->skip_clock_update = 1; } set_skip_buddy(se); @@ -3097,51 +2775,13 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, check_preempt_curr(this_rq, p, 0); } -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (unlikely(p->policy == SCHED_IDLE)) - return 0; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) + int *all_pinned) { int tsk_cache_hot = 0; /* @@ -3154,7 +2794,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *lb_flags &= ~LBF_ALL_PINNED; + *all_pinned = 0; if (task_running(rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); @@ -3228,7 +2868,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, + enum cpu_idle_type idle, int *all_pinned, struct cfs_rq *busiest_cfs_rq) { int loops = 0, pulled = 0; @@ -3239,14 +2879,12 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, goto out; list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; + if (loops++ > sysctl_sched_nr_migrate) break; - } if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -3259,10 +2897,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. 
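The LBF_* masks above are laid out so the flags word doubles as a counter: LBF_HAD_BREAK - LBF_NEED_BREAK equals LBF_NEED_BREAK, so the statement lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK (seen in the load_balance() hunk further on) clears the pending bit and bumps a two-bit break count whose overflow lands exactly on LBF_ABORT. A standalone demonstration of the arithmetic:

    #include <stdio.h>

    #define LBF_ALL_PINNED 0x01
    #define LBF_NEED_BREAK 0x02
    #define LBF_HAD_BREAK  0x04
    #define LBF_HAD_BREAKS 0x0C
    #define LBF_ABORT      0x10

    int main(void)
    {
        int flags = 0;

        for (int i = 1; i <= 4; i++) {
            flags |= LBF_NEED_BREAK;              /* balance_tasks() hit its loop limit */
            flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
            printf("break %d: flags=0x%02x abort=%d\n",
                   i, flags, !!(flags & LBF_ABORT));
        }
        return 0;   /* the 4th break carries into LBF_ABORT */
    }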
*/ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; + if (idle == CPU_NEWLY_IDLE) break; - } #endif /* @@ -3367,7 +3003,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) + int *all_pinned) { long rem_load_move = max_load_move; struct cfs_rq *busiest_cfs_rq; @@ -3380,9 +3016,6 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - /* * empty group or part of a throttled hierarchy */ @@ -3394,7 +3027,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, + rem_load, sd, idle, all_pinned, busiest_cfs_rq); if (!moved_load) @@ -3420,10 +3053,10 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) + int *all_pinned) { return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, lb_flags, + max_load_move, sd, idle, all_pinned, &busiest->cfs); } #endif @@ -3438,30 +3071,29 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) + int *all_pinned) { unsigned long total_load_moved = 0, load_moved; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, - sd, idle, lb_flags); + sd, idle, all_pinned); total_load_moved += load_moved; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + + if (raw_spin_is_contended(&this_rq->lock) || + raw_spin_is_contended(&busiest->lock)) break; - } #endif } while (load_moved && max_load_move > total_load_moved); @@ -3522,6 +3154,15 @@ struct sg_lb_stats { int group_has_capacity; /* Is there extra capacity in the group? */ }; +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. 
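In load_balance_fair() above, the load still to be moved is rescaled from top-level units into the busiest group's local weight units before balance_tasks() runs. A worked example with invented numbers (plain 64-bit division standing in for div_u64()):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t rem_load_move  = 300;    /* load still to move, top-level units */
        uint64_t busiest_weight = 1024;   /* this cfs_rq's local weight */
        uint64_t busiest_h_load = 512;    /* its load as seen from the top */

        /* rem_load = div_u64(rem_load_move * weight, h_load + 1) */
        uint64_t rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1);

        printf("local units to move: %llu\n", (unsigned long long)rem_load);
        return 0;   /* prints 598; the +1 also guards against dividing by zero */
    }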
@@ -3771,7 +3412,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) sdg->sgp->power = power; } -void update_group_power(struct sched_domain *sd, int cpu) +static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; @@ -4037,6 +3678,11 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } while (sg != sd->groups); } +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + /** * check_asym_packing - Check to see if the group is packed into the * sched domain. @@ -4400,7 +4046,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) @@ -4451,7 +4097,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, lb_flags = 0, active_balance = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -4492,11 +4138,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - lb_flags |= LBF_ALL_PINNED; + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); + imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4506,18 +4152,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (lb_flags & LBF_ABORT) - goto out_balanced; - - if (lb_flags & LBF_NEED_BREAK) { - lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (lb_flags & LBF_ABORT) - goto out_balanced; - goto redo; - } - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { + if (unlikely(all_pinned)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4547,7 +4183,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - lb_flags |= LBF_ALL_PINNED; + all_pinned = 1; goto out_one_pinned; } @@ -4600,8 +4236,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, out_one_pinned: /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && - sd->balance_interval < MAX_PINNED_INTERVAL) || + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4614,7 +4249,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs.
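arch_sd_sibling_asym_packing() above relies on weak linkage: the generic scheduler provides a default that returns 0, and an architecture can supply a strong definition that wins at link time. A self-contained sketch of the pattern; arch_hook() is a made-up name, not the kernel symbol:

    #include <stdio.h>

    /* weak default, as the generic code above provides */
    __attribute__((weak)) int arch_hook(void)
    {
        return 0;
    }

    /*
     * An architecture file could ship a strong definition that
     * silently replaces the weak one at link time:
     *
     *     int arch_hook(void) { return 1; }
     */

    int main(void)
    {
        printf("hook -> %d\n", arch_hook());   /* 0 unless overridden */
        return 0;
    }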
*/ -void idle_balance(int this_cpu, struct rq *this_rq) +static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -4729,16 +4364,28 @@ static int active_load_balance_cpu_stop(void *data) #ifdef CONFIG_NO_HZ /* * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, it will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ static struct { + atomic_t load_balancer; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; + cpumask_var_t grp_idle_mask; unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) /** * lowest_flag_domain - Return lowest sched_domain containing flag. @@ -4774,6 +4421,33 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) for (sd = lowest_flag_domain(cpu, flag); \ (sd && (sd->flags & flag)); sd = sd->parent) +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi-idle if it has at least one idle CPU + * and at least one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has at least one busy cpu + * and at least one idle cpu. + */ + if (cpumask_empty(nohz.grp_idle_mask)) + return 0; + + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} /** * find_new_ilb - Finds the optimum idle load balancer for nomination. * @cpu: The cpu which is nominating a new idle_load_balancer.
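The cpumask calls in is_semi_idle_group() above reduce to simple set algebra: intersect the group with the idle mask, then reject the empty set (all busy) and the full set (all idle). A bitmask analogue:

    #include <stdio.h>

    static int is_semi_idle(unsigned int group, unsigned int idle)
    {
        unsigned int grp_idle = group & idle;   /* cpumask_and() */

        if (grp_idle == 0)                      /* cpumask_empty(): all busy */
            return 0;
        if (grp_idle == group)                  /* cpumask_equal(): all idle */
            return 0;
        return 1;
    }

    int main(void)
    {
        unsigned int group = 0x0F;              /* CPUs 0-3 */

        printf("%d %d %d\n",
               is_semi_idle(group, 0x00),       /* all busy -> 0 */
               is_semi_idle(group, 0x0F),       /* all idle -> 0 */
               is_semi_idle(group, 0x05));      /* mixed    -> 1 */
        return 0;
    }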
@@ -4788,9 +4462,9 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static int find_new_ilb(int cpu) { - int ilb = cpumask_first(nohz.idle_cpus_mask); - struct sched_group *ilbg; struct sched_domain *sd; + struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -4808,28 +4482,23 @@ static int find_new_ilb(int cpu) rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilbg = sd->groups; + ilb_group = sd->groups; do { - if (ilbg->group_weight != - atomic_read(&ilbg->sgp->nr_busy_cpus)) { - ilb = cpumask_first_and(nohz.idle_cpus_mask, - sched_group_cpus(ilbg)); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); goto unlock; } - ilbg = ilbg->next; + ilb_group = ilb_group->next; - } while (ilbg != sd->groups); + } while (ilb_group != sd->groups); } unlock: rcu_read_unlock(); out_done: - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; - - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -4849,68 +4518,99 @@ static void nohz_balancer_kick(int cpu) nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = get_nohz_load_balancer(); - if (ilb_cpu >= nr_cpu_ids) - return; + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) - return; - /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target cpu which - * is idle. And the softirq performing nohz idle load balance - * will be run before returning from the IPI. - */ - smp_send_reschedule(ilb_cpu); + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + + smp_mb(); + /* + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target cpu which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_send_reschedule(ilb_cpu); + } return; } -static inline void set_cpu_sd_state_busy(void) +/* + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. + * + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. + * + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). + */ +void select_nohz_load_balancer(int stop_tick) { - struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); + if (stop_tick) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return; - rcu_read_lock(); - for_each_domain(cpu, sd) - atomic_inc(&sd->groups->sgp->nr_busy_cpus); - rcu_read_unlock(); -} + /* + * If we are going offline and still the leader, + * give up! 
+ */ + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) + BUG(); -void set_cpu_sd_state_idle(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); + return; + } - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - rcu_read_lock(); - for_each_domain(cpu, sd) - atomic_dec(&sd->groups->sgp->nr_busy_cpus); - rcu_read_unlock(); -} + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); -/* - * This routine will record that this cpu is going idle with tick stopped. - * This info will be used in performing idle load balancing in the future. - */ -void select_nohz_load_balancer(int stop_tick) -{ - int cpu = smp_processor_id(); + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { + int new_ilb; - if (stop_tick) { - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, nr_cpu_ids); + resched_cpu(new_ilb); + return; + } + return; + } + } else { + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) return; - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) + BUG(); } return; } @@ -4924,7 +4624,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ -void update_max_interval(void) +static void update_max_interval(void) { max_load_balance_interval = HZ*num_online_cpus()/10; } @@ -5016,12 +4716,11 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) struct rq *rq; int balance_cpu; - if (idle != CPU_IDLE || - !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) - goto end; + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + if (balance_cpu == this_cpu) continue; /* @@ -5029,8 +4728,10 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) * work being done for other cpus. Next load * balancing owner will pick it up. */ - if (need_resched()) + if (need_resched()) { + this_rq->nohz_balance_kick = 0; break; + } raw_spin_lock_irq(&this_rq->lock); update_rq_clock(this_rq); @@ -5044,75 +4745,53 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) this_rq->next_balance = rq->next_balance; } nohz.next_balance = this_rq->next_balance; -end: - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); + this_rq->nohz_balance_kick = 0; } /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. - * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. 
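The nomination protocol above hinges on atomic_cmpxchg(): nr_cpu_ids plays the role of "no owner", and a CPU becomes the idle load balancer only if it swaps itself in while the slot is still vacant. A userspace sketch with C11 atomics (eight CPUs assumed):

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_CPUS 8

    static atomic_int load_balancer = NR_CPUS;  /* nr_cpu_ids == vacant */

    static int try_claim(int cpu)
    {
        int expected = NR_CPUS;

        /* atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, cpu) */
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
    }

    int main(void)
    {
        printf("cpu2 claims: %d\n", try_claim(2));  /* 1: slot was vacant */
        printf("cpu5 claims: %d\n", try_claim(5));  /* 0: cpu2 already owns it */
        printf("owner: %d\n", atomic_load(&load_balancer));
        return 0;
    }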
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is one of the busy CPUs. It will kick + * the idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there is more than one busy CPU, the idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick the idle load balancer as soon as it has any load. */ static inline int nohz_kick_needed(struct rq *rq, int cpu) { unsigned long now = jiffies; - struct sched_domain *sd; - - if (unlikely(idle_cpu(cpu))) - return 0; - - /* - * We may be recently in ticked or tickless idle mode. At the first - * busy tick after returning from idle, we will update the busy stats. - */ - set_cpu_sd_state_busy(); - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } - - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + int ret; + int first_pick_cpu, second_pick_cpu; if (time_before(now, nohz.next_balance)) return 0; - if (rq->nr_running >= 2) - goto need_kick; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - struct sched_group *sg = sd->groups; - struct sched_group_power *sgp = sg->sgp; - int nr_busy = atomic_read(&sgp->nr_busy_cpus); + if (idle_cpu(cpu)) + return 0; - if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) - goto need_kick_unlock; + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); - if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight - && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; - if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) - break; + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } } - rcu_read_unlock(); return 0; - -need_kick_unlock: - rcu_read_unlock(); -need_kick: - return 1; } #else static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } @@ -5147,14 +4826,14 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
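nohz_kick_needed() above rate-limits itself with time_before(), which stays correct when jiffies wraps because it tests the sign of a signed difference instead of comparing magnitudes. A simplified sketch (the kernel macro also type-checks its arguments):

    #include <stdio.h>

    /* simplified form of the kernel macro */
    #define time_before(a, b)  ((long)((a) - (b)) < 0)

    int main(void)
    {
        unsigned long now  = (unsigned long)-16;  /* 16 ticks before wraparound */
        unsigned long next = 5;                   /* already wrapped past zero */

        /* a naive compare thinks the deadline passed; time_before() does not */
        printf("naive=%d time_before=%d\n", now < next, time_before(now, next));
        return 0;
    }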
*/ -void trigger_load_balance(struct rq *rq, int cpu) +static inline void trigger_load_balance(struct rq *rq, int cpu) { /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif } @@ -5169,6 +4848,15 @@ static void rq_offline_fair(struct rq *rq) update_sysctl(); } +#else /* CONFIG_SMP */ + +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + #endif /* CONFIG_SMP */ /* @@ -5192,8 +4880,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; + struct cfs_rq *cfs_rq = task_cfs_rq(current); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); unsigned long flags; @@ -5202,9 +4890,6 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { rcu_read_lock(); __set_task_cpu(p, this_cpu); @@ -5314,16 +4999,6 @@ static void set_curr_task_fair(struct rq *rq) } } -void init_cfs_rq(struct cfs_rq *cfs_rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { @@ -5340,182 +5015,13 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * to another cgroup's rq. This does somewhat interfere with the * fair sleeper stuff for the first placement, but who cares. */ - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 
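A detail worth noting in the removed init_cfs_rq() above: min_vruntime starts at (u64)(-(1LL << 20)), i.e. roughly a millisecond's worth of nanoseconds before u64 wraparound, so any comparison that mishandles the wrap misbehaves right after boot instead of months later. A demonstration:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t min_vruntime = (uint64_t)(-(1LL << 20));
        uint64_t later = min_vruntime + (1ULL << 21);   /* past the wrap */

        /* naive comparison breaks across the wrap ... */
        printf("naive: %d\n", later > min_vruntime);            /* 0 */

        /* ... the signed-difference form the scheduler uses does not */
        printf("signed diff: %d\n",
               (int64_t)(later - min_vruntime) > 0);            /* 1 */
        return 0;
    }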
- */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; - if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); if (!on_rq) p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } - -void free_fair_sched_group(struct task_group *tg) -{ - int i; - - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kzalloc_node(sizeof(struct sched_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - - init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); - } - - return 1; - -err_free_rq: - kfree(cfs_rq); -err: - return 0; -} - -void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - cfs_rq->tg = tg; - cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; #endif - init_cfs_rq_runtime(cfs_rq); - - tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; - - /* se could be NULL for root_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - se->my_q = cfs_rq; - update_load_set(&se->load, 0); - se->parent = parent; -} - -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) -{ - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. 
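The allocation path above uses the kernel's goto-unwind idiom: each failure jumps to a label that releases exactly what is not yet owned by anyone else; err_free_rq frees only the cfs_rq whose companion entity failed, then falls through to the common error return. A stripped-down sketch:

    #include <stdio.h>
    #include <stdlib.h>

    static int alloc_group(void **rqs, void **ses, int nr)
    {
        void *rq = NULL, *se;

        for (int i = 0; i < nr; i++) {
            rq = malloc(64);
            if (!rq)
                goto err;
            se = malloc(64);
            if (!se)
                goto err_free_rq;   /* rq is not linked anywhere yet */
            rqs[i] = rq;            /* once linked, the caller's free */
            ses[i] = se;            /* path owns both allocations */
        }
        return 1;                   /* success, as in the kernel code */

    err_free_rq:
        free(rq);                   /* free only the unlinked piece ... */
    err:
        return 0;                   /* ... the caller unwinds the rest */
    }

    int main(void)
    {
        void *rqs[4] = { 0 }, *ses[4] = { 0 };

        printf("alloc: %d\n", alloc_group(rqs, ses, 4));
        return 0;
    }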
- */ - if (!tg->se[0]) - return -EINVAL; - - shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - tg->shares = shares; - for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); - struct sched_entity *se; - - se = tg->se[i]; - /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - -done: - mutex_unlock(&shares_mutex); - return 0; -} -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { @@ -5535,7 +5041,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { +static const struct sched_class fair_sched_class = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, @@ -5572,7 +5078,7 @@ const struct sched_class fair_sched_class = { }; #ifdef CONFIG_SCHED_DEBUG -void print_cfs_stats(struct seq_file *m, int cpu) +static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; @@ -5582,15 +5088,3 @@ void print_cfs_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif - -__init void init_sched_fair_class(void) -{ -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); - -#ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); -#endif -#endif /* SMP */ - -} diff --git a/trunk/kernel/sched/features.h b/trunk/kernel/sched_features.h similarity index 75% rename from trunk/kernel/sched/features.h rename to trunk/kernel/sched_features.h index e61fd73913d0..84802245abd2 100644 --- a/trunk/kernel/sched/features.h +++ b/trunk/kernel/sched_features.h @@ -3,13 +3,13 @@ * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. */ -SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* * Place new tasks ahead so that they do not starve already running * tasks */ -SCHED_FEAT(START_DEBIT, true) +SCHED_FEAT(START_DEBIT, 1) /* * Based on load and program behaviour, see if it makes sense to place @@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, true) * improve cache locality. Typically used with SYNC wakeups as * generated by pipes and the like, see also SYNC_WAKEUPS. */ -SCHED_FEAT(AFFINE_WAKEUPS, true) +SCHED_FEAT(AFFINE_WAKEUPS, 1) /* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since it's likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, false) +SCHED_FEAT(NEXT_BUDDY, 0) /* * Prefer to schedule the task that ran last (when we did * wake-preempt) as that likely will touch the same data, increases * cache locality. */ -SCHED_FEAT(LAST_BUDDY, true) +SCHED_FEAT(LAST_BUDDY, 1) /* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality.
*/ -SCHED_FEAT(CACHE_HOT_BUDDY, true) +SCHED_FEAT(CACHE_HOT_BUDDY, 1) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(ARCH_POWER, 0) -SCHED_FEAT(HRTICK, false) -SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(HRTICK, 0) +SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(LB_BIAS, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on * another cpu -- assumes that when the owner is running, it will soon * release the lock. Decreases scheduling overhead. */ -SCHED_FEAT(OWNER_SPIN, true) +SCHED_FEAT(OWNER_SPIN, 1) /* * Decrement CPU power based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_POWER, 1) /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ -SCHED_FEAT(TTWU_QUEUE, true) +SCHED_FEAT(TTWU_QUEUE, 1) -SCHED_FEAT(FORCE_SD_OVERLAP, false) -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(FORCE_SD_OVERLAP, 0) +SCHED_FEAT(RT_RUNTIME_SHARE, 1) diff --git a/trunk/kernel/sched/idle_task.c b/trunk/kernel/sched_idletask.c similarity index 96% rename from trunk/kernel/sched/idle_task.c rename to trunk/kernel/sched_idletask.c index 91b4c957f289..0a51882534ea 100644 --- a/trunk/kernel/sched/idle_task.c +++ b/trunk/kernel/sched_idletask.c @@ -1,5 +1,3 @@ -#include "sched.h" - /* * idle-task scheduling class. * @@ -73,7 +71,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { +static const struct sched_class idle_sched_class = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/trunk/kernel/sched/rt.c b/trunk/kernel/sched_rt.c similarity index 90% rename from trunk/kernel/sched/rt.c rename to trunk/kernel/sched_rt.c index 3640ebbb466b..583a1368afe6 100644 --- a/trunk/kernel/sched/rt.c +++ b/trunk/kernel/sched_rt.c @@ -3,92 +3,7 @@ * policies) */ -#include "sched.h" - -#include - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -struct rt_bandwidth def_rt_bandwidth; - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? 
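The SCHED_FEAT() lines above are an X-macro list: the same file expands to different artifacts (an enum of bit positions, a default bitmask, a name table) depending on how SCHED_FEAT is defined before inclusion. A compressed sketch of the idea; the SCHED_FEATURES wrapper macro is an invention for this demo, not the kernel's layout:

    #include <stdio.h>

    /* hypothetical feature list; the kernel keeps it in sched_features.h */
    #define SCHED_FEATURES(F) \
        F(GENTLE_FAIR_SLEEPERS, 1) \
        F(START_DEBIT, 1) \
        F(NEXT_BUDDY, 0)

    /* expansion 1: an enum of bit positions */
    #define SCHED_FEAT_ENUM(name, enabled) __SCHED_FEAT_##name,
    enum { SCHED_FEATURES(SCHED_FEAT_ENUM) __SCHED_FEAT_NR };

    /* expansion 2: the default bitmask */
    #define SCHED_FEAT_BIT(name, enabled) ((enabled) << __SCHED_FEAT_##name) |
    static const unsigned int sysctl_sched_features =
        SCHED_FEATURES(SCHED_FEAT_BIT) 0;

    int main(void)
    {
        printf("defaults: 0x%x\n", sysctl_sched_features);   /* 0x3 */
        return 0;
    }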
HRTIMER_NORESTART : HRTIMER_RESTART; -} - -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - raw_spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; -} - -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); - raw_spin_unlock(&rt_b->rt_runtime_lock); -} - -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - #ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) @@ -110,91 +25,6 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } -void free_rt_sched_group(struct task_group *tg) -{ - int i; - - if (tg->rt_se) - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; - rt_rq->tg = tg; - - tg->rt_rq[cpu] = rt_rq; - tg->rt_se[cpu] = rt_se; - - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} - -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rt_rq = kzalloc_node(sizeof(struct rt_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_se) - goto err_free_rq; - - init_rt_rq(rt_rq, cpu_rq(i)); - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); - } - - return 1; - -err_free_rq: - kfree(rt_rq); -err: - return 0; -} - #else /* CONFIG_RT_GROUP_SCHED */ #define rt_entity_is_task(rt_se) (1) @@ -217,12 +47,6 @@ static inline struct rt_rq *rt_rq_of_se(struct 
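The delimiter bit that init_rt_rq() above sets at MAX_RT_PRIO guarantees a find-first-bit scan of the priority bitmap always terminates: hitting the delimiter means no RT task is queued. A miniature version with a plain word-sized bitmap:

    #include <stdio.h>

    #define MAX_RT_PRIO 8                     /* shrunk from 100 for the demo */

    static int first_set_bit(unsigned long map)
    {
        int i = 0;

        while (!(map & (1UL << i)))
            i++;                              /* delimiter bit bounds the scan */
        return i;
    }

    int main(void)
    {
        unsigned long bitmap = 1UL << MAX_RT_PRIO;   /* delimiter only */

        printf("empty: %s\n",
               first_set_bit(bitmap) == MAX_RT_PRIO ? "yes" : "no");

        bitmap |= 1UL << 3;                   /* enqueue at priority 3 */
        printf("best prio: %d\n", first_set_bit(bitmap));
        return 0;
    }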
sched_rt_entity *rt_se) return &rq->rt; } -void free_rt_sched_group(struct task_group *tg) { } - -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -732,28 +556,6 @@ static void enable_runtime(struct rq *rq) raw_spin_unlock_irqrestore(&rq->lock, flags); } -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; @@ -846,7 +648,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); - if (runtime >= sched_rt_period(rt_rq)) + if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) return 0; balance_runtime(rt_rq); @@ -1155,8 +957,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) } /* - * Put task to the head or the end of the run list without the overhead of - * dequeue followed by enqueue. + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) @@ -1200,9 +1002,6 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) - goto out; - /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1379,6 +1178,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); + static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && @@ -1852,14 +1653,13 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) pull_rt_task(rq); } -void init_sched_rt_class(void) +static inline void init_sched_rt_class(void) { unsigned int i; - for_each_possible_cpu(i) { + for_each_possible_cpu(i) zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); - } } #endif /* CONFIG_SMP */ @@ -2000,7 +1800,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { +static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -2035,7 +1835,7 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SCHED_DEBUG extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); -void print_rt_stats(struct seq_file *m, int cpu) +static void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; struct rt_rq *rt_rq; diff --git a/trunk/kernel/sched/stats.h b/trunk/kernel/sched_stats.h similarity index 70% rename from trunk/kernel/sched/stats.h rename to trunk/kernel/sched_stats.h index 2ef90a51ec5e..87f9e36ea56e 100644 --- a/trunk/kernel/sched/stats.h +++ b/trunk/kernel/sched_stats.h @@ -1,5 +1,108 @@ #ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so 
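The early-out in sched_rt_runtime_exceeded() above encodes a simple rule: a budget at least as large as the period can never be exhausted, so such a runqueue is never throttled. A toy version of the check (the 0.95 s budget per 1 s period mirrors the usual default, but treat the numbers as illustrative):

    #include <stdio.h>

    static int rt_throttled(unsigned long long rt_time,
                            unsigned long long runtime,
                            unsigned long long period)
    {
        if (runtime >= period)
            return 0;               /* budget can never run out */
        return rt_time > runtime;   /* spent more than allowed this period */
    }

    int main(void)
    {
        /* 950ms budget per 1s period, 960ms consumed -> throttle */
        printf("%d\n", rt_throttled(960000000ULL, 950000000ULL, 1000000000ULL));

        /* budget >= period -> never throttled */
        printf("%d\n", rt_throttled(2000000000ULL, 1000000000ULL, 1000000000ULL));
        return 0;
    }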
that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 15 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcount = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %u %u %u %u %u %u %llu %llu %lu", + cpu, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, + rq->rq_cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { + enum cpu_idle_type itype; + + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); + seq_printf(seq, "domain%d %s", dcount++, mask_str); + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, + " %u %u %u %u %u %u %u %u %u %u %u %u\n", + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + rcu_read_unlock(); +#endif + } + kfree(mask_str); + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_schedstat_init(void) +{ + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; +} +module_init(proc_schedstat_init); /* * Expects runqueue lock to be held for atomicity of update @@ -180,7 +283,8 @@ static inline void account_group_user_time(struct task_struct *tsk, return; raw_spin_lock(&cputimer->lock); - cputimer->cputime.utime += cputime; + cputimer->cputime.utime = + cputime_add(cputimer->cputime.utime, cputime); raw_spin_unlock(&cputimer->lock); } @@ -203,7 +307,8 @@ static inline void account_group_system_time(struct task_struct *tsk, return; raw_spin_lock(&cputimer->lock); - cputimer->cputime.stime += cputime; + cputimer->cputime.stime = + cputime_add(cputimer->cputime.stime, cputime); raw_spin_unlock(&cputimer->lock); } diff --git a/trunk/kernel/sched/stop_task.c b/trunk/kernel/sched_stoptask.c similarity index 97% rename from trunk/kernel/sched/stop_task.c rename to trunk/kernel/sched_stoptask.c index 7b386e86fd23..8b44e7fa7fb3 100644 --- a/trunk/kernel/sched/stop_task.c +++ b/trunk/kernel/sched_stoptask.c @@ -1,5 +1,3 @@ -#include "sched.h" - /* * stop-task scheduling class. 
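The cputime_add()/cputime_zero churn above exists because cputime_t is an opaque type: arithmetic goes through helpers so an architecture can change the representation without touching callers. A minimal userspace analogue (the unsigned long long representation is assumed for the demo):

    #include <stdio.h>

    typedef unsigned long long cputime_t;     /* representation assumed here */

    #define cputime_zero      ((cputime_t)0)
    #define cputime_add(a, b) ((a) + (b))     /* trivial for this representation */

    int main(void)
    {
        cputime_t utime = cputime_zero;

        utime = cputime_add(utime, 100);      /* ticks from one accounting hit */
        utime = cputime_add(utime, 250);
        printf("utime=%llu\n", utime);
        return 0;
    }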
* @@ -82,7 +80,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { +static const struct sched_class stop_sched_class = { .next = &rt_sched_class, .enqueue_task = enqueue_task_stop, diff --git a/trunk/kernel/signal.c b/trunk/kernel/signal.c index 56ce3a618b28..b3f78d09a105 100644 --- a/trunk/kernel/signal.c +++ b/trunk/kernel/signal.c @@ -1629,8 +1629,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1992,6 +1994,8 @@ static bool do_signal_stop(int signr) */ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; + else + WARN_ON_ONCE(!current->ptrace); sig->group_stop_count = 0; diff --git a/trunk/kernel/softirq.c b/trunk/kernel/softirq.c index 4eb3a0fa351e..2c71d91efff0 100644 --- a/trunk/kernel/softirq.c +++ b/trunk/kernel/softirq.c @@ -347,12 +347,12 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_irq_exit(); + tick_nohz_stop_sched_tick(0); #endif - rcu_irq_exit(); preempt_enable_no_resched(); } diff --git a/trunk/kernel/sys.c b/trunk/kernel/sys.c index ddf8155bf3f8..481611fbd079 100644 --- a/trunk/kernel/sys.c +++ b/trunk/kernel/sys.c @@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); - utime = stime = 0; + utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { task_times(current, &utime, &stime); @@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) case RUSAGE_SELF: thread_group_times(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; + utime = cputime_add(utime, tgutime); + stime = cputime_add(stime, tgstime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; diff --git a/trunk/kernel/time/clockevents.c b/trunk/kernel/time/clockevents.c index 1ecd6ba36d6c..c4eb71c8b2ea 100644 --- a/trunk/kernel/time/clockevents.c +++ b/trunk/kernel/time/clockevents.c @@ -387,6 +387,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. 
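The masking in do_notify_parent() above follows the classic wait(2) status layout: the low seven bits carry the terminating signal and bit 7 flags a core dump. A standalone illustration:

    #include <stdio.h>

    int main(void)
    {
        int exit_code = 0x8b;                 /* SIGSEGV (11) plus core dumped */

        int sig  = exit_code & 0x7f;          /* 11 */
        int core = !!(exit_code & 0x80);      /* 1  */
        printf("signal=%d core=%d\n", sig, core);
        return 0;
    }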
*/ if (old) { + old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/trunk/kernel/time/tick-sched.c b/trunk/kernel/time/tick-sched.c index 7656642e4b8e..40420644d0ba 100644 --- a/trunk/kernel/time/tick-sched.c +++ b/trunk/kernel/time/tick-sched.c @@ -275,17 +275,42 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +/** + * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick. + * Called either from the idle loop or from irq_exit() when an idle period was + * just interrupted by an interrupt which did not cause a reschedule. + */ +void tick_nohz_stop_sched_tick(int inidle) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; + local_irq_save(flags); + cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + /* + * Call to tick_nohz_start_idle stops the last_update_time from being + * updated. Thus, it must not be called in the event we are called from + * irq_exit() with the prior state different than idle. + */ + if (!inidle && !ts->inidle) + goto end; + + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governors rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(cpu, ts); /* @@ -301,10 +326,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; + goto end; if (need_resched()) - return; + goto end; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; @@ -314,7 +339,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) (unsigned int) local_softirq_pending()); ratelimit++; } - return; + goto end; } ts->idle_calls++; @@ -409,6 +434,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } ts->idle_sleeps++; @@ -446,64 +472,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); -} - -/** - * tick_nohz_idle_enter - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: - * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. - */ -void tick_nohz_idle_enter(void) -{ - struct tick_sched *ts; - - WARN_ON_ONCE(irqs_disabled()); - - /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle.
- */ - set_cpu_sd_state_idle(); - - local_irq_disable(); - - ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ - ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); - - local_irq_enable(); -} - -/** - * tick_nohz_irq_exit - update next tick event from interrupt exit - * - * When an interrupt fires while we are idle and it doesn't cause - * a reschedule, it may still add, modify or delete a timer, enqueue - * an RCU callback, etc... - * So we need to re-calculate and reprogram the next tick event. - */ -void tick_nohz_irq_exit(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - if (!ts->inidle) - return; - - tick_nohz_stop_sched_tick(ts); +end: + local_irq_restore(flags); } /** @@ -545,13 +515,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_idle_exit - restart the idle tick from the idle task + * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. */ -void tick_nohz_idle_exit(void) +void tick_nohz_restart_sched_tick(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -561,7 +529,6 @@ void tick_nohz_idle_exit(void) ktime_t now; local_irq_disable(); - if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); @@ -576,6 +543,8 @@ void tick_nohz_idle_exit(void) ts->inidle = 0; + rcu_exit_nohz(); + /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); diff --git a/trunk/kernel/time/timekeeping.c b/trunk/kernel/time/timekeeping.c index 0c6358186401..237841378c03 100644 --- a/trunk/kernel/time/timekeeping.c +++ b/trunk/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds. */ + /* return delta convert to nanoseconds using ntp adjusted mult. */ return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } @@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset) * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. * * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) interval twice, but keeps the - * (error > interval) comparison as still measuring if error is + * This "saves" dividing(shifting) intererval twice, but keeps the + * (error > interval) comparision as still measuring if error is * larger then half an interval. * - * Note: It does not "save" on aggravation when reading the code. + * Note: It does not "save" on aggrivation when reading the code. */ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { @@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset) * nanosecond, and store the amount rounded up into * the error. This causes the likely below to be unlikely. * - * The proper fix is to avoid rounding up by using + * The properfix is to avoid rounding up by using * the high precision timekeeper.xtime_nsec instead of * xtime.tv_nsec everywhere. Fixing this will take some * time. 
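The raw-clock path above converts cycles to nanoseconds with the clocksource fixed-point formula ns = (cycles * mult) >> shift, after masking the cycle delta to the counter width so wraparound is handled for free. A worked example with invented clocksource parameters (a 1 GHz counter, so mult/2^shift is exactly 1):

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
    {
        return (cycles * mult) >> shift;      /* clocksource_cyc2ns() formula */
    }

    int main(void)
    {
        uint32_t mult = 1 << 24, shift = 24;  /* ratio 1.0 ns per cycle */
        uint64_t mask = (1ULL << 56) - 1;     /* assumed 56-bit counter */

        uint64_t last = mask - 10, now = 5;   /* counter wrapped in between */
        uint64_t delta = (now - last) & mask; /* masking absorbs the wrap */

        printf("delta=%llu cycles -> %llu ns\n",
               (unsigned long long)delta,
               (unsigned long long)cyc2ns(delta, mult, shift)); /* 16, 16 */
        return 0;
    }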
diff --git a/trunk/kernel/timer.c b/trunk/kernel/timer.c index a297ffcf888e..9c3c62b0c4bc 100644 --- a/trunk/kernel/timer.c +++ b/trunk/kernel/timer.c @@ -427,12 +427,6 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) } } -/* Stub timer callback for improperly used timers. */ -static void stub_timer(unsigned long data) -{ - WARN_ON(1); -} - /* * fixup_activate is called when: * - an active object is activated @@ -456,8 +450,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) debug_object_activate(timer, &timer_debug_descr); return 0; } else { - setup_timer(timer, stub_timer, 0); - return 1; + WARN_ON_ONCE(1); } return 0; @@ -487,40 +480,12 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) } } -/* - * fixup_assert_init is called when: - * - an untracked/uninit-ed object is found - */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - debug_object_init(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - default: - return 0; - } -} - static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .debug_hint = timer_debug_hint, - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, - .fixup_assert_init = timer_fixup_assert_init, + .name = "timer_list", + .debug_hint = timer_debug_hint, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, }; static inline void debug_timer_init(struct timer_list *timer) @@ -543,11 +508,6 @@ static inline void debug_timer_free(struct timer_list *timer) debug_object_free(timer, &timer_debug_descr); } -static inline void debug_timer_assert_init(struct timer_list *timer) -{ - debug_object_assert_init(timer, &timer_debug_descr); -} - static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key); @@ -571,7 +531,6 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack); static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } -static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) @@ -593,11 +552,6 @@ static inline void debug_deactivate(struct timer_list *timer) trace_timer_cancel(timer); } -static inline void debug_assert_init(struct timer_list *timer) -{ - debug_timer_assert_init(timer); -} - static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -948,8 +902,6 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; - debug_assert_init(timer); - timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); @@ -980,8 +932,6 @@ int try_to_del_timer_sync(struct timer_list *timer) unsigned long flags; int ret = -1; - debug_assert_init(timer); - base = lock_timer_base(timer, &flags); if (base->running_timer == timer) diff --git a/trunk/kernel/trace/trace.c b/trunk/kernel/trace/trace.c index 91dc4bc8bf72..f2bd275bb60f 100644 --- 
a/trunk/kernel/trace/trace.c +++ b/trunk/kernel/trace/trace.c @@ -338,8 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -427,7 +426,6 @@ static const char *trace_options[] = { "record-cmd", "overwrite", "disable_on_free", - "irq-info", NULL }; @@ -1845,33 +1843,6 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } -static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) -{ - unsigned long count; - int cpu; - - *total = 0; - *entries = 0; - - for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); - *entries += count; - } -} - static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1884,35 +1855,12 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_array *tr, struct seq_file *m) -{ - unsigned long total; - unsigned long entries; - - get_total_entries(tr, &total, &entries); - seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", - entries, total, num_online_cpus()); - seq_puts(m, "#\n"); -} - -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct seq_file *m) { - print_event_info(tr, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) -{ - print_event_info(tr, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); -} void print_trace_header(struct seq_file *m, struct trace_iterator *iter) @@ -1921,14 +1869,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long entries; - unsigned long total; + unsigned long entries = 0; + unsigned long total = 0; + unsigned long count; const char *name = "preemption"; + int cpu; if (type) name = type->name; - get_total_entries(tr, &total, &entries); + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. 
+ */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + total += count; + } else + total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + entries += count; + } seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2174,21 +2140,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } -void trace_latency_header(struct seq_file *m) -{ - struct trace_iterator *iter = m->private; - - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - print_trace_header(m, iter); - - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); -} - void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; @@ -2204,12 +2155,8 @@ void trace_default_header(struct seq_file *m) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); - else - print_func_help_header(iter->tr, m); - } + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); } } @@ -4828,7 +4775,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { __ftrace_dump(true, oops_dump_mode); } -EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { diff --git a/trunk/kernel/trace/trace.h b/trunk/kernel/trace/trace.h index 2c2657462ac3..092e1f8d18dc 100644 --- a/trunk/kernel/trace/trace.h +++ b/trunk/kernel/trace/trace.h @@ -370,7 +370,6 @@ void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); -void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -655,7 +654,6 @@ enum trace_iterator_flags { TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, - TRACE_ITER_IRQ_INFO = 0x800000, }; /* diff --git a/trunk/kernel/trace/trace_events_filter.c b/trunk/kernel/trace/trace_events_filter.c index f04cc3136bd3..95dc31efd6dd 100644 --- a/trunk/kernel/trace/trace_events_filter.c +++ b/trunk/kernel/trace/trace_events_filter.c @@ -27,12 +27,6 @@ #include "trace.h" #include "trace_output.h" -#define DEFAULT_SYS_FILTER_MESSAGE \ - "### global filter ###\n" \ - "# Use this to set filters for multiple events.\n" \ - "# Only events with the given fields will be affected.\n" \ - "# If no events are modified, an error message will be displayed here" - enum filter_op_ids { OP_OR, @@ -652,7 +646,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + trace_seq_printf(s, "none\n"); mutex_unlock(&event_mutex); } @@ -1844,10 +1838,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!filter) goto out; - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - + replace_filter_string(filter, filter_string); /* * No event actually uses the system filter * we can free it without synchronize_sched(). 
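
The trace_events_filter.c hunks here rework apply_subsystem_event_filter() so that each failing step reports through append_filter_err() on the spot and then falls through to the common cleanup, rather than jumping to a dedicated err_filter label. Below is a minimal stand-alone sketch of that error-handling shape, assuming hypothetical parse_filter(), apply_filter() and report_error() helpers in place of the real filter machinery.

#include <stdio.h>

/* Hypothetical stand-ins for the event-filter machinery. */
static int parse_filter(const char *s)  { return (s && s[0]) ? 0 : -1; }
static int apply_filter(const char *s)  { (void)s; return 0; }
static void report_error(const char *s) { fprintf(stderr, "bad filter: \"%s\"\n", s); }

/*
 * Report at each failure site and fall through to the shared "out"
 * cleanup, instead of maintaining a separate error label.
 */
static int set_filter(const char *str)
{
	int err;

	err = parse_filter(str);
	if (err) {
		report_error(str);
		goto out;
	}

	err = apply_filter(str);
	if (err)
		report_error(str);
out:
	/* cleanup shared by success and failure runs here */
	return err;
}

int main(void)
{
	set_filter("pid == 1");	/* parses and applies cleanly */
	set_filter("");		/* fails to parse, reports once */
	return 0;
}
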
@@ -1857,12 +1848,14 @@ int apply_subsystem_event_filter(struct event_subsystem *system, parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); - if (err) - goto err_filter; + if (err) { + append_filter_err(ps, system->filter); + goto out; + } err = replace_system_preds(system, ps, filter_string); if (err) - goto err_filter; + append_filter_err(ps, system->filter); out: filter_opstack_clear(ps); @@ -1872,11 +1865,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_unlock(&event_mutex); return err; - -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; } #ifdef CONFIG_PERF_EVENTS diff --git a/trunk/kernel/trace/trace_irqsoff.c b/trunk/kernel/trace/trace_irqsoff.c index 99d20e920368..20dad0d7a163 100644 --- a/trunk/kernel/trace/trace_irqsoff.c +++ b/trunk/kernel/trace/trace_irqsoff.c @@ -280,20 +280,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) } static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } +static void irqsoff_print_header(struct seq_file *s) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } - -#ifdef CONFIG_FUNCTION_TRACER -static void irqsoff_print_header(struct seq_file *s) -{ - trace_default_header(s); -} -#else -static void irqsoff_print_header(struct seq_file *s) -{ - trace_latency_header(s); -} -#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/trunk/kernel/trace/trace_output.c b/trunk/kernel/trace/trace_output.c index 0d6ff3555942..51999309a6cf 100644 --- a/trunk/kernel/trace/trace_output.c +++ b/trunk/kernel/trace/trace_output.c @@ -627,23 +627,11 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; char comm[TASK_COMM_LEN]; - int ret; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); - if (!ret) - return 0; - - if (trace_flags & TRACE_ITER_IRQ_INFO) { - ret = trace_print_lat_fmt(s, entry); - if (!ret) - return 0; - } - - return trace_seq_printf(s, " %5lu.%06lu: ", - secs, usec_rem); + return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", + comm, entry->pid, iter->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) diff --git a/trunk/kernel/trace/trace_sched_wakeup.c b/trunk/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..e4a70c0c71b6 100644 --- a/trunk/kernel/trace/trace_sched_wakeup.c +++ b/trunk/kernel/trace/trace_sched_wakeup.c @@ -280,20 +280,9 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) } static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } - -#ifdef CONFIG_FUNCTION_TRACER -static void wakeup_print_header(struct seq_file *s) -{ - trace_default_header(s); -} -#else -static void wakeup_print_header(struct seq_file *s) -{ - trace_latency_header(s); -} -#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/trunk/kernel/tsacct.c b/trunk/kernel/tsacct.c index 23b4d784ebdd..5bbfac85866e 100644 --- a/trunk/kernel/tsacct.c +++ b/trunk/kernel/tsacct.c @@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk) local_irq_save(flags); 
time = tsk->stime + tsk->utime; - dtime = time - tsk->acct_timexpd; + dtime = cputime_sub(time, tsk->acct_timexpd); jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; delta = delta * USEC_PER_SEC + value.tv_usec; diff --git a/trunk/kernel/wait.c b/trunk/kernel/wait.c index 7fdd9eaca2c3..26fa7797f90f 100644 --- a/trunk/kernel/wait.c +++ b/trunk/kernel/wait.c @@ -10,10 +10,10 @@ #include #include -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } diff --git a/trunk/lib/debugobjects.c b/trunk/lib/debugobjects.c index 77cb245f8e7b..a78b7c6e042c 100644 --- a/trunk/lib/debugobjects.c +++ b/trunk/lib/debugobjects.c @@ -268,16 +268,12 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. */ -static int +static void debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { - int fixed = 0; - if (fixup) - fixed = fixup(addr, state); - debug_objects_fixups += fixed; - return fixed; + debug_objects_fixups += fixup(addr, state); } static void debug_object_is_on_stack(void *addr, int onstack) @@ -390,9 +386,6 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; if (!debug_objects_enabled) return; @@ -432,9 +425,8 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) * let the type specific code decide whether this is * true or not. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) - debug_print_object(&o, "activate"); + debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); } /** @@ -570,44 +562,6 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); } -/** - * debug_object_assert_init - debug checks when object should be init-ed - * @addr: address of the object - * @descr: pointer to an object specific debug description structure - */ -void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) -{ - struct debug_bucket *db; - struct debug_obj *obj; - unsigned long flags; - - if (!debug_objects_enabled) - return; - - db = get_bucket((unsigned long) addr); - - raw_spin_lock_irqsave(&db->lock, flags); - - obj = lookup_object(addr, db); - if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - raw_spin_unlock_irqrestore(&db->lock, flags); - /* - * Maybe the object is static. Let the type specific - * code decide what to do. 
- */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) - debug_print_object(&o, "assert_init"); - return; - } - - raw_spin_unlock_irqrestore(&db->lock, flags); -} - /** * debug_object_active_state - debug checks object usage state machine * @addr: address of the object diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index e338407f1225..011b110365c8 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -131,12 +131,6 @@ config SPARSEMEM_VMEMMAP config HAVE_MEMBLOCK boolean -config HAVE_MEMBLOCK_NODE_MAP - boolean - -config ARCH_DISCARD_MEMBLOCK - boolean - config NO_BOOTMEM boolean diff --git a/trunk/mm/filemap.c b/trunk/mm/filemap.c index 5f0a3c91fdac..c106d3b3cc64 100644 --- a/trunk/mm/filemap.c +++ b/trunk/mm/filemap.c @@ -1828,7 +1828,7 @@ static struct page *__read_cache_page(struct address_space *mapping, page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, gfp); + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (unlikely(err)) { page_cache_release(page); if (err == -EEXIST) @@ -1925,7 +1925,10 @@ static struct page *wait_on_page_read(struct page *page) * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with - * any new page allocations done using the specified allocation flags. + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. * * If the page does not get brought uptodate, return -EIO. */ diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index 2316840b337a..73f17c0293c0 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -901,6 +901,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) h->resv_huge_pages += delta; ret = 0; + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -914,7 +915,6 @@ static int gather_surplus_pages(struct hstate *h, int delta) VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } - spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: diff --git a/trunk/mm/memblock.c b/trunk/mm/memblock.c index 2f55f19b7c86..84bec4969ed5 100644 --- a/trunk/mm/memblock.c +++ b/trunk/mm/memblock.c @@ -20,23 +20,12 @@ #include #include -static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; -static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; - -struct memblock memblock __initdata_memblock = { - .memory.regions = memblock_memory_init_regions, - .memory.cnt = 1, /* empty dummy entry */ - .memory.max = INIT_MEMBLOCK_REGIONS, - - .reserved.regions = memblock_reserved_init_regions, - .reserved.cnt = 1, /* empty dummy entry */ - .reserved.max = INIT_MEMBLOCK_REGIONS, - - .current_limit = MEMBLOCK_ALLOC_ANYWHERE, -}; +struct memblock memblock __initdata_memblock; int memblock_debug __initdata_memblock; -static int memblock_can_resize __initdata_memblock; +int memblock_can_resize __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] 
__initdata_memblock; /* inline so we don't get a warning when pr_debug is compiled out */ static inline const char *memblock_type_name(struct memblock_type *type) @@ -49,15 +38,20 @@ static inline const char *memblock_type_name(struct memblock_type *type) return "unknown"; } -/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ -static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) -{ - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); -} - /* * Address comparison utilities */ + +static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) +{ + return addr & ~(size - 1); +} + +static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +{ + return (addr + (size - 1)) & ~(size - 1); +} + static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, phys_addr_t size2) { @@ -79,66 +73,83 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } -/** - * memblock_find_in_range_node - find free area in given range and node - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @size: size of free area to find - * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node - * - * Find @size free area aligned to @align in the specified range and node. - * - * RETURNS: - * Found address on success, %0 on failure. +/* + * Find, allocate, deallocate or reserve unreserved regions. All allocations + * are top-down. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) + +static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align) { - phys_addr_t this_start, this_end, cand; - u64 i; + phys_addr_t base, res_base; + long j; + + /* In case, huge size is requested */ + if (end < size) + return MEMBLOCK_ERROR; + + base = memblock_align_down((end - size), align); - /* align @size to avoid excessive fragmentation on reserved array */ - size = round_up(size, align); + /* Prevent allocations returning 0 as it's also used to + * indicate an allocation failure + */ + if (start == 0) + start = PAGE_SIZE; + + while (start <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) + return base; + res_base = memblock.reserved.regions[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } - /* pump up @end */ + return MEMBLOCK_ERROR; +} + +static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, + phys_addr_t align, phys_addr_t start, phys_addr_t end) +{ + long i; + + BUG_ON(0 == size); + + /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; - /* adjust @start to avoid underflow and allocating the first page */ - start = max3(start, size, (phys_addr_t)PAGE_SIZE); - end = max(start, end); - - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { - this_start = clamp(this_start, start, end); - this_end = clamp(this_end, start, end); + /* We do a top-down search, this tends to limit memory + * fragmentation by keeping early boot allocs near the + * top of memory + */ + for (i = memblock.memory.cnt - 1; i >= 0; i--) { + phys_addr_t memblockbase = 
memblock.memory.regions[i].base; + phys_addr_t memblocksize = memblock.memory.regions[i].size; + phys_addr_t bottom, top, found; - cand = round_down(this_end - size, align); - if (cand >= this_start) - return cand; + if (memblocksize < size) + continue; + if ((memblockbase + memblocksize) <= start) + break; + bottom = max(memblockbase, start); + top = min(memblockbase + memblocksize, end); + if (bottom >= top) + continue; + found = memblock_find_region(bottom, top, size, align); + if (found != MEMBLOCK_ERROR) + return found; } - return 0; + return MEMBLOCK_ERROR; } -/** - * memblock_find_in_range - find free area in given range - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @size: size of free area to find - * @align: alignment of free area to find - * - * Find @size free area aligned to @align in the specified range. - * - * RETURNS: - * Found address on success, %0 on failure. +/* + * Find a free area with specified alignment in a specific range. */ -phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align) +u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) { - return memblock_find_in_range_node(start, end, size, align, - MAX_NUMNODES); + return memblock_find_base(size, align, start, end); } /* @@ -167,21 +178,25 @@ int __init_memblock memblock_reserve_reserved_regions(void) static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { - type->total_size -= type->regions[r].size; - memmove(&type->regions[r], &type->regions[r + 1], - (type->cnt - (r + 1)) * sizeof(type->regions[r])); + unsigned long i; + + for (i = r; i < type->cnt - 1; i++) { + type->regions[i].base = type->regions[i + 1].base; + type->regions[i].size = type->regions[i + 1].size; + } type->cnt--; /* Special case for empty arrays */ if (type->cnt == 0) { - WARN_ON(type->total_size != 0); type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; - memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } +/* Defined below but needed now */ +static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); + static int __init_memblock memblock_double_array(struct memblock_type *type) { struct memblock_region *new_array, *old_array; @@ -211,10 +226,10 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); - addr = new_array ? __pa(new_array) : 0; + addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); } else - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); - if (!addr) { + addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr == MEMBLOCK_ERROR) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; @@ -239,7 +254,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_reserve(addr, new_size)); + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); /* If the array wasn't our static init one, then free it. 
We only do * that before SLAB is available as later on, we don't know whether @@ -253,514 +268,343 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; } -/** - * memblock_merge_regions - merge neighboring compatible regions - * @type: memblock type to scan - * - * Scan @type and merge neighboring compatible regions. - */ -static void __init_memblock memblock_merge_regions(struct memblock_type *type) +int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, + phys_addr_t addr2, phys_addr_t size2) { - int i = 0; + return 1; +} - /* cnt never goes below 1 */ - while (i < type->cnt - 1) { - struct memblock_region *this = &type->regions[i]; - struct memblock_region *next = &type->regions[i + 1]; +static long __init_memblock memblock_add_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size; + int i, slot = -1; - if (this->base + this->size != next->base || - memblock_get_region_node(this) != - memblock_get_region_node(next)) { - BUG_ON(this->base + this->size > next->base); - i++; - continue; - } + /* First try and coalesce this MEMBLOCK with others */ + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; - this->size += next->size; - memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); - type->cnt--; - } -} + /* Exit if there's no possible hits */ + if (rgn->base > end || rgn->size == 0) + break; -/** - * memblock_insert_region - insert new memblock region - * @type: memblock type to insert into - * @idx: index for the insertion point - * @base: base address of the new region - * @size: size of the new region - * - * Insert new memblock region [@base,@base+@size) into @type at @idx. - * @type must already have extra room to accomodate the new region. - */ -static void __init_memblock memblock_insert_region(struct memblock_type *type, - int idx, phys_addr_t base, - phys_addr_t size, int nid) -{ - struct memblock_region *rgn = &type->regions[idx]; + /* Check if we are fully enclosed within an existing + * block + */ + if (rgn->base <= base && rend >= end) + return 0; - BUG_ON(type->cnt >= type->max); - memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); - rgn->base = base; - rgn->size = size; - memblock_set_region_node(rgn, nid); - type->cnt++; - type->total_size += size; -} + /* Check if we overlap or are adjacent with the bottom + * of a block. + */ + if (base < rgn->base && end >= rgn->base) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(base, size, + rgn->base, + rgn->size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(end != rgn->base); + goto new_block; + } + /* We extend the bottom of the block down to our + * base + */ + rgn->base = base; + rgn->size = rend - base; -/** - * memblock_add_region - add new memblock region - * @type: memblock type to add new region into - * @base: base address of the new region - * @size: size of the new region - * @nid: nid of the new region - * - * Add new memblock region [@base,@base+@size) into @type. The new region - * is allowed to overlap with existing ones - overlaps don't affect already - * existing regions. @type is guaranteed to be minimal (all neighbouring - * compatible regions are merged) after the addition. - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -static int __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, int nid) -{ - bool insert = false; - phys_addr_t obase = base; - phys_addr_t end = base + memblock_cap_size(base, &size); - int i, nr_new; + /* Return if we have nothing else to allocate + * (fully coalesced) + */ + if (rend >= end) + return 0; + + /* We continue processing from the end of the + * coalesced block. + */ + base = rend; + size = end - base; + } + + /* Now check if we overlap or are adjacent with the + * top of a block + */ + if (base <= rend && end >= rend) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(rgn->base, + rgn->size, + base, size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(rend != base); + goto new_block; + } + /* We adjust our base down to enclose the + * original block and destroy it. It will be + * part of our new allocation. Since we've + * freed an entry, we know we won't fail + * to allocate one later, so we won't risk + * losing the original block allocation. + */ + size += (base - rgn->base); + base = rgn->base; + memblock_remove_region(type, i--); + } + } - /* special case for empty array */ - if (type->regions[0].size == 0) { - WARN_ON(type->cnt != 1 || type->total_size); + /* If the array is empty, special case, replace the fake + * filler region and return + */ + if ((type->cnt == 1) && (type->regions[0].size == 0)) { type->regions[0].base = base; type->regions[0].size = size; - memblock_set_region_node(&type->regions[0], nid); - type->total_size = size; return 0; } -repeat: - /* - * The following is executed twice. Once with %false @insert and - * then with %true. The first counts the number of regions needed - * to accomodate the new area. The second actually inserts them. - */ - base = obase; - nr_new = 0; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; - phys_addr_t rbase = rgn->base; - phys_addr_t rend = rbase + rgn->size; + new_block: + /* If we are out of space, we fail. It's too late to resize the array + * but then this shouldn't have happened in the first place. + */ + if (WARN_ON(type->cnt >= type->max)) + return -1; - if (rbase >= end) + /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ + for (i = type->cnt - 1; i >= 0; i--) { + if (base < type->regions[i].base) { + type->regions[i+1].base = type->regions[i].base; + type->regions[i+1].size = type->regions[i].size; + } else { + type->regions[i+1].base = base; + type->regions[i+1].size = size; + slot = i + 1; break; - if (rend <= base) - continue; - /* - * @rgn overlaps. If it separates the lower part of new - * area, insert that portion. - */ - if (rbase > base) { - nr_new++; - if (insert) - memblock_insert_region(type, i++, base, - rbase - base, nid); } - /* area below @rend is dealt with, forget about it */ - base = min(rend, end); } - - /* insert the remaining portion */ - if (base < end) { - nr_new++; - if (insert) - memblock_insert_region(type, i, base, end - base, nid); + if (base < type->regions[0].base) { + type->regions[0].base = base; + type->regions[0].size = size; + slot = 0; } + type->cnt++; - /* - * If this was the first round, resize array and repeat for actual - * insertions; otherwise, merge and return. + /* The array is full ? Try to resize it. 
If that fails, we undo + * our allocation and return an error */ - if (!insert) { - while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) - return -ENOMEM; - insert = true; - goto repeat; - } else { - memblock_merge_regions(type); - return 0; + if (type->cnt == type->max && memblock_double_array(type)) { + BUG_ON(slot < 0); + memblock_remove_region(type, slot); + return -1; } -} -int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, - int nid) -{ - return memblock_add_region(&memblock.memory, base, size, nid); + return 0; } -int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); + return memblock_add_region(&memblock.memory, base, size); + } -/** - * memblock_isolate_range - isolate given range into disjoint memblocks - * @type: memblock type to isolate range for - * @base: base of range to isolate - * @size: size of range to isolate - * @start_rgn: out parameter for the start of isolated region - * @end_rgn: out parameter for the end of isolated region - * - * Walk @type and ensure that regions don't cross the boundaries defined by - * [@base,@base+@size). Crossing regions are split at the boundaries, - * which may create at most two more regions. The index of the first - * region inside the range is returned in *@start_rgn and end in *@end_rgn. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int __init_memblock memblock_isolate_range(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, - int *start_rgn, int *end_rgn) +static long __init_memblock __memblock_remove(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - phys_addr_t end = base + memblock_cap_size(base, &size); + phys_addr_t end = base + size; int i; - *start_rgn = *end_rgn = 0; - - /* we'll create at most two more regions */ - while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) - return -ENOMEM; - + /* Walk through the array for collisions */ for (i = 0; i < type->cnt; i++) { struct memblock_region *rgn = &type->regions[i]; - phys_addr_t rbase = rgn->base; - phys_addr_t rend = rbase + rgn->size; + phys_addr_t rend = rgn->base + rgn->size; - if (rbase >= end) + /* Nothing more to do, exit */ + if (rgn->base > end || rgn->size == 0) break; - if (rend <= base) + + /* If we fully enclose the block, drop it */ + if (base <= rgn->base && end >= rend) { + memblock_remove_region(type, i--); continue; + } - if (rbase < base) { - /* - * @rgn intersects from below. Split and continue - * to process the next region - the new top half. - */ - rgn->base = base; - rgn->size -= base - rbase; - type->total_size -= base - rbase; - memblock_insert_region(type, i, rbase, base - rbase, - memblock_get_region_node(rgn)); - } else if (rend > end) { - /* - * @rgn intersects from above. Split and redo the - * current region - the new bottom half. 
+ /* If we are fully enclosed within a block + * then we need to split it and we are done + */ + if (base > rgn->base && end < rend) { + rgn->size = base - rgn->base; + if (!memblock_add_region(type, end, rend - end)) + return 0; + /* Failure to split is bad, we at least + * restore the block before erroring */ - rgn->base = end; - rgn->size -= end - rbase; - type->total_size -= end - rbase; - memblock_insert_region(type, i--, rbase, end - rbase, - memblock_get_region_node(rgn)); - } else { - /* @rgn is fully contained, record it */ - if (!*end_rgn) - *start_rgn = i; - *end_rgn = i + 1; + rgn->size = rend - rgn->base; + WARN_ON(1); + return -1; } - } - return 0; -} - -static int __init_memblock __memblock_remove(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) -{ - int start_rgn, end_rgn; - int i, ret; + /* Check if we need to trim the bottom of a block */ + if (rgn->base < end && rend > end) { + rgn->size -= end - rgn->base; + rgn->base = end; + break; + } - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); - if (ret) - return ret; + /* And check if we need to trim the top of a block */ + if (base < rend) + rgn->size -= rend - base; - for (i = end_rgn - 1; i >= start_rgn; i--) - memblock_remove_region(type, i); + } return 0; } -int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) +long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.memory, base, size); } -int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { - memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", - (unsigned long long)base, - (unsigned long long)base + size, - (void *)_RET_IP_); - return __memblock_remove(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { struct memblock_type *_rgn = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", - (unsigned long long)base, - (unsigned long long)base + size, - (void *)_RET_IP_); BUG_ON(0 == size); - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); + return memblock_add_region(_rgn, base, size); } -/** - * __next_free_mem_range - next function for for_each_free_mem_range() - * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL - * - * Find the first free area from *@idx which matches @nid, fill the out - * parameters, and update *@idx for the next iteration. The lower 32bit of - * *@idx contains index into memory region and the upper 32bit indexes the - * areas before each reserved region. For example, if reserved regions - * look like the following, - * - * 0:[0-16), 1:[32-48), 2:[128-130) - * - * The upper 32bit indexes the following regions. - * - * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) - * - * As both region arrays are sorted, the function advances the two indices - * in lockstep and returns each intersection. 
- */ -void __init_memblock __next_free_mem_range(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; - - for ( ; mi < mem->cnt; mi++) { - struct memblock_region *m = &mem->regions[mi]; - phys_addr_t m_start = m->base; - phys_addr_t m_end = m->base + m->size; + phys_addr_t found; - /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) - continue; + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); - /* scan areas before each reservation for intersection */ - for ( ; ri < rsv->cnt + 1; ri++) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; - - /* if ri advanced past mi, break out to advance mi */ - if (r_start >= m_end) - break; - /* if the two regions intersect, we're done */ - if (m_start < r_end) { - if (out_start) - *out_start = max(m_start, r_start); - if (out_end) - *out_end = min(m_end, r_end); - if (out_nid) - *out_nid = memblock_get_region_node(m); - /* - * The region which ends first is advanced - * for the next iteration. - */ - if (m_end <= r_end) - mi++; - else - ri++; - *idx = (u32)mi | (u64)ri << 32; - return; - } - } - } + found = memblock_find_base(size, align, 0, max_addr); + if (found != MEMBLOCK_ERROR && + !memblock_add_region(&memblock.reserved, found, size)) + return found; - /* signal end of iteration */ - *idx = ULLONG_MAX; + return 0; } -/** - * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() - * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL - * - * Reverse of __next_free_mem_range(). - */ -void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; - - if (*idx == (u64)ULLONG_MAX) { - mi = mem->cnt - 1; - ri = rsv->cnt; - } + phys_addr_t alloc; - for ( ; mi >= 0; mi--) { - struct memblock_region *m = &mem->regions[mi]; - phys_addr_t m_start = m->base; - phys_addr_t m_end = m->base + m->size; + alloc = __memblock_alloc_base(size, align, max_addr); - /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) - continue; + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); - /* scan areas before each reservation for intersection */ - for ( ; ri >= 0; ri--) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? 
r->base : ULLONG_MAX; - - /* if ri advanced past mi, break out to advance mi */ - if (r_end <= m_start) - break; - /* if the two regions intersect, we're done */ - if (m_end > r_start) { - if (out_start) - *out_start = max(m_start, r_start); - if (out_end) - *out_end = min(m_end, r_end); - if (out_nid) - *out_nid = memblock_get_region_node(m); - - if (m_start >= r_start) - mi--; - else - ri--; - *idx = (u32)mi | (u64)ri << 32; - return; - } - } - } + return alloc; +} - *idx = ULLONG_MAX; +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* - * Common iterator interface used to define for_each_mem_range(). + * Additional node-local allocators. Search for node memory is bottom up + * and walks memblock regions within that node bottom-up as well, but allocation + * within a memblock region is top-down. XXX I plan to fix that at some stage + * + * WARNING: Only available after early_node_map[] has been populated, + * on some architectures, that is after all the calls to add_active_range() + * have been done to populate it. */ -void __init_memblock __next_mem_pfn_range(int *idx, int nid, - unsigned long *out_start_pfn, - unsigned long *out_end_pfn, int *out_nid) -{ - struct memblock_type *type = &memblock.memory; - struct memblock_region *r; - while (++*idx < type->cnt) { - r = &type->regions[*idx]; +phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) +{ +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * This code originates from sparc which really wants us to walk by addresses + * and returns the nid. This is not very convenient for early_pfn_map[] users + * as the map isn't sorted yet, and it really wants to be walked by nid. + * + * For now, I implement the inefficient method below which walks the early + * map multiple times. Eventually we may want to use an ARCH config option + * to implement a completely different method for both cases. + */ + unsigned long start_pfn, end_pfn; + int i; - if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) + for (i = 0; i < MAX_NUMNODES; i++) { + get_pfn_range_for_nid(i, &start_pfn, &end_pfn); + if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) continue; - if (nid == MAX_NUMNODES || nid == r->nid) - break; - } - if (*idx >= type->cnt) { - *idx = -1; - return; + *nid = i; + return min(end, PFN_PHYS(end_pfn)); } +#endif + *nid = 0; - if (out_start_pfn) - *out_start_pfn = PFN_UP(r->base); - if (out_end_pfn) - *out_end_pfn = PFN_DOWN(r->base + r->size); - if (out_nid) - *out_nid = r->nid; + return end; } -/** - * memblock_set_node - set node ID on memblock regions - * @base: base of area to set node ID for - * @size: size of area to set node ID for - * @nid: node ID to set - * - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. - * Regions which cross the area boundaries are split as necessary. - * - * RETURNS: - * 0 on success, -errno on failure.
- */ -int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, - int nid) +static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, + phys_addr_t size, + phys_addr_t align, int nid) { - struct memblock_type *type = &memblock.memory; - int start_rgn, end_rgn; - int i, ret; + phys_addr_t start, end; - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); - if (ret) - return ret; + start = mp->base; + end = start + mp->size; - for (i = start_rgn; i < end_rgn; i++) - type->regions[i].nid = nid; + start = memblock_align_up(start, align); + while (start < end) { + phys_addr_t this_end; + int this_nid; - memblock_merge_regions(type); - return 0; -} -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid) -{ - phys_addr_t found; - - found = memblock_find_in_range_node(0, max_addr, size, align, nid); - if (found && !memblock_reserve(found, size)) - return found; + this_end = memblock_nid_range(start, end, &this_nid); + if (this_nid == nid) { + phys_addr_t ret = memblock_find_region(start, this_end, size, align); + if (ret != MEMBLOCK_ERROR && + !memblock_add_region(&memblock.reserved, ret, size)) + return ret; + } + start = this_end; + } - return 0; + return MEMBLOCK_ERROR; } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); -} - -phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) -{ - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); -} - -phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) -{ - phys_addr_t alloc; + struct memblock_type *mem = &memblock.memory; + int i; - alloc = __memblock_alloc_base(size, align, max_addr); + BUG_ON(0 == size); - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); - return alloc; -} + /* We do a bottom-up search for a region with the right + * nid since that's easier considering how memblock_nid_range() + * works + */ + for (i = 0; i < mem->cnt; i++) { + phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], + size, align, nid); + if (ret != MEMBLOCK_ERROR) + return ret; + } -phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + return 0; } phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) @@ -769,7 +613,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i if (res) return res; - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); } @@ -777,9 +621,10 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * Remaining API functions */ +/* You must call memblock_analyze() before this. 
*/ phys_addr_t __init memblock_phys_mem_size(void) { - return memblock.memory.total_size; + return memblock.memory_size; } /* lowest address */ @@ -795,28 +640,45 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } -void __init memblock_enforce_memory_limit(phys_addr_t limit) +/* You must call memblock_analyze() after this. */ +void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) { unsigned long i; - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t limit; + struct memblock_region *p; - if (!limit) + if (!memory_limit) return; - /* find out max address */ + /* Truncate the memblock regions to satisfy the memory limit. */ + limit = memory_limit; for (i = 0; i < memblock.memory.cnt; i++) { - struct memblock_region *r = &memblock.memory.regions[i]; - - if (limit <= r->size) { - max_addr = r->base + limit; - break; + if (limit > memblock.memory.regions[i].size) { + limit -= memblock.memory.regions[i].size; + continue; } - limit -= r->size; + + memblock.memory.regions[i].size = limit; + memblock.memory.cnt = i + 1; + break; } - /* truncate both memory and reserved regions */ - __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); - __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); + memory_limit = memblock_end_of_DRAM(); + + /* And truncate any reserves above the limit also. */ + for (i = 0; i < memblock.reserved.cnt; i++) { + p = &memblock.reserved.regions[i]; + + if (p->base > memory_limit) + p->size = 0; + else if ((p->base + p->size) > memory_limit) + p->size = memory_limit - p->base; + + if (p->size == 0) { + memblock_remove_region(&memblock.reserved, i); + i--; + } + } } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) @@ -850,18 +712,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); - phys_addr_t end = base + memblock_cap_size(base, &size); if (idx == -1) return 0; return memblock.memory.regions[idx].base <= base && (memblock.memory.regions[idx].base + - memblock.memory.regions[idx].size) >= end; + memblock.memory.regions[idx].size) >= (base + size); } int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { - memblock_cap_size(base, &size); return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } @@ -871,45 +731,86 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) memblock.current_limit = limit; } -static void __init_memblock memblock_dump(struct memblock_type *type, char *name) +static void __init_memblock memblock_dump(struct memblock_type *region, char *name) { unsigned long long base, size; int i; - pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); + pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; - char nid_buf[32] = ""; - - base = rgn->base; - size = rgn->size; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - if (memblock_get_region_node(rgn) != MAX_NUMNODES) - snprintf(nid_buf, sizeof(nid_buf), " on node %d", - memblock_get_region_node(rgn)); -#endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", - name, i, base, base + size - 1, size, nid_buf); + for (i = 0; i < region->cnt; i++) { + base = region->regions[i].base; + size = region->regions[i].size; + + pr_info(" 
%s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", + name, i, base, base + size - 1, size); } } -void __init_memblock __memblock_dump_all(void) +void __init_memblock memblock_dump_all(void) { + if (!memblock_debug) + return; + pr_info("MEMBLOCK configuration:\n"); - pr_info(" memory size = %#llx reserved size = %#llx\n", - (unsigned long long)memblock.memory.total_size, - (unsigned long long)memblock.reserved.total_size); + pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); memblock_dump(&memblock.memory, "memory"); memblock_dump(&memblock.reserved, "reserved"); } -void __init memblock_allow_resize(void) +void __init memblock_analyze(void) { + int i; + + /* Check marker in the unused last array entry */ + WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base + != MEMBLOCK_INACTIVE); + WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base + != MEMBLOCK_INACTIVE); + + memblock.memory_size = 0; + + for (i = 0; i < memblock.memory.cnt; i++) + memblock.memory_size += memblock.memory.regions[i].size; + + /* We allow resizing from there */ memblock_can_resize = 1; } +void __init memblock_init(void) +{ + static int init_done __initdata = 0; + + if (init_done) + return; + init_done = 1; + + /* Hookup the initial arrays */ + memblock.memory.regions = memblock_memory_init_regions; + memblock.memory.max = INIT_MEMBLOCK_REGIONS; + memblock.reserved.regions = memblock_reserved_init_regions; + memblock.reserved.max = INIT_MEMBLOCK_REGIONS; + + /* Write a marker in the unused last array entry */ + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; + + /* Create a dummy zero size MEMBLOCK which will get coalesced away later. + * This simplifies the memblock_add() code below... + */ + memblock.memory.regions[0].base = 0; + memblock.memory.regions[0].size = 0; + memblock.memory.cnt = 1; + + /* Ditto. 
*/ + memblock.reserved.regions[0].base = 0; + memblock.reserved.regions[0].size = 0; + memblock.reserved.cnt = 1; + + memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; +} + static int __init early_memblock(char *p) { if (p && strstr(p, "debug")) @@ -918,7 +819,7 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/trunk/mm/mempolicy.c b/trunk/mm/mempolicy.c index c3fdbcb17658..adc395481813 100644 --- a/trunk/mm/mempolicy.c +++ b/trunk/mm/mempolicy.c @@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; - pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; @@ -644,21 +643,13 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (!vma || vma->vm_start > start) return -EFAULT; - if (start > vma->vm_start) - prev = vma; - for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); - if (mpol_equal(vma_policy(vma), new_pol)) - continue; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, new_pol); if (prev) { vma = prev; diff --git a/trunk/mm/nobootmem.c b/trunk/mm/nobootmem.c index 24f0fc1a56d6..7fa41b4a07bf 100644 --- a/trunk/mm/nobootmem.c +++ b/trunk/mm/nobootmem.c @@ -41,13 +41,14 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(goal, limit, size, align, nid); - if (!addr) + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) return NULL; ptr = phys_to_virt(addr); memset(ptr, 0, size); - memblock_reserve(addr, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks. @@ -106,27 +107,23 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } -unsigned long __init free_low_memory_core_early(int nodeid) +unsigned long __init free_all_memory_core_early(int nodeid) { + int i; + u64 start, end; unsigned long count = 0; - phys_addr_t start, end; - u64 i; - - /* free reserved array temporarily so that it's treated as free area */ - memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); } - /* put region array back? 
*/ - memblock_reserve_reserved_regions(); return count; } @@ -140,7 +137,7 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); - /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ return 0; } @@ -158,7 +155,7 @@ unsigned long __init free_all_bootmem(void) * Using MAX_NUMNODES makes sure all ranges in early_node_map[] * are used instead of only Node0-related ones */ - return free_low_memory_core_early(MAX_NUMNODES); + return free_all_memory_core_early(MAX_NUMNODES); } /** @@ -175,7 +172,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { kmemleak_free_part(__va(physaddr), size); - memblock_free(physaddr, size); + memblock_x86_free_range(physaddr, physaddr + size); } /** @@ -190,7 +187,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, void __init free_bootmem(unsigned long addr, unsigned long size) { kmemleak_free_part(__va(addr), size); - memblock_free(addr, size); + memblock_x86_free_range(addr, addr + size); } static void * __init ___alloc_bootmem_nopanic(unsigned long size, diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index bdc804c2d99c..2b8ba3aebf6e 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -181,17 +181,39 @@ static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __initdata required_kernelcore; -static unsigned long __initdata required_movablecore; -static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; - -/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ -int movable_zone; -EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * MAX_ACTIVE_REGIONS determines the maximum number of distinct + * ranges of memory (RAM) that may be registered with add_active_range(). 
+ * Ranges passed to add_active_range() will be merged if possible + * so the number of times add_active_range() can be called is + * related to the number of nodes and the number of holes + */ + #ifdef CONFIG_MAX_ACTIVE_REGIONS + /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ + #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS + #else + #if MAX_NUMNODES >= 32 + /* If there can be many nodes, allow up to 50 holes per node */ + #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) + #else + /* By default, allow up to 256 distinct regions */ + #define MAX_ACTIVE_REGIONS 256 + #endif + #endif + + static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; + static int __meminitdata nr_nodemap_entries; + static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; + static unsigned long __initdata required_kernelcore; + static unsigned long __initdata required_movablecore; + static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ + int movable_zone; + EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -684,10 +706,10 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) int loop; prefetchw(page); - for (loop = 0; loop < (1 << order); loop++) { + for (loop = 0; loop < BITS_PER_LONG; loop++) { struct page *p = &page[loop]; - if (loop + 1 < (1 << order)) + if (loop + 1 < BITS_PER_LONG) prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); @@ -3715,7 +3737,35 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * Basic iterator support. Return the first range of PFNs for a node + * Note: nid == MAX_NUMNODES returns first region regardless of node + */ +static int __meminit first_active_region_index_in_nid(int nid) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the next active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit next_active_region_index_in_nid(int index, int nid) +{ + for (index = index + 1; index < nr_nodemap_entries; index++) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
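The two index helpers just added are the building blocks for the for_each_active_range_index_in_nid() macro reintroduced below. As a reference, here is a minimal, self-contained user-space sketch of that walk; the map contents, the MAX_NUMNODES value and the main() driver are illustrative assumptions, not kernel code:

/* Standalone sketch of the early_node_map index walk; values are made up. */
#include <stdio.h>

#define MAX_NUMNODES 4

struct node_active_region {
        unsigned long start_pfn;
        unsigned long end_pfn;
        int nid;
};

static struct node_active_region early_node_map[] = {
        { 0x000, 0x100, 0 },
        { 0x200, 0x300, 1 },
        { 0x300, 0x380, 0 },
};
static int nr_nodemap_entries = 3;

static int first_active_region_index_in_nid(int nid)
{
        int i;

        for (i = 0; i < nr_nodemap_entries; i++)
                if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
                        return i;
        return -1;
}

static int next_active_region_index_in_nid(int index, int nid)
{
        for (index = index + 1; index < nr_nodemap_entries; index++)
                if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
                        return index;
        return -1;
}

int main(void)
{
        int i;

        /* same shape as for_each_active_range_index_in_nid(i, 0) */
        for (i = first_active_region_index_in_nid(0); i != -1;
             i = next_active_region_index_in_nid(i, 0))
                printf("nid 0: [%#lx, %#lx)\n",
                       early_node_map[i].start_pfn,
                       early_node_map[i].end_pfn);
        return 0;
}

Passing MAX_NUMNODES instead of a concrete node id matches every entry, which is how the node-agnostic callers below walk the whole map.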
@@ -3725,12 +3775,15 @@ __meminit int init_currently_empty_zone(struct zone *zone, */ int __meminit __early_pfn_to_nid(unsigned long pfn) { - unsigned long start_pfn, end_pfn; - int i, nid; + int i; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long start_pfn = early_node_map[i].start_pfn; + unsigned long end_pfn = early_node_map[i].end_pfn; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) if (start_pfn <= pfn && pfn < end_pfn) - return nid; + return early_node_map[i].nid; + } /* This is a memory hole */ return -1; } @@ -3759,6 +3812,11 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) } #endif +/* Basic iterator support to walk early_node_map[] */ +#define for_each_active_range_index_in_nid(i, nid) \ + for (i = first_active_region_index_in_nid(nid); i != -1; \ + i = next_active_region_index_in_nid(i, nid)) + /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -3768,34 +3826,122 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * add_active_ranges() contain no holes and may be freed, this * function may be used instead of calling free_bootmem() manually. */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) +void __init free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn) { - unsigned long start_pfn, end_pfn; - int i, this_nid; + int i; + + for_each_active_range_index_in_nid(i, nid) { + unsigned long size_pages = 0; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (early_node_map[i].start_pfn >= max_low_pfn) + continue; - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; - if (start_pfn < end_pfn) - free_bootmem_node(NODE_DATA(this_nid), - PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT); + size_pages = end_pfn - early_node_map[i].start_pfn; + free_bootmem_node(NODE_DATA(early_node_map[i].nid), + PFN_PHYS(early_node_map[i].start_pfn), + size_pages << PAGE_SHIFT); } } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. 
Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns previous region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + +u64 __init find_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + + /* Need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid_reverse(i, nid) { + u64 addr; + u64 ei_start, ei_last; + u64 final_start, final_end; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + + final_start = max(ei_start, goal); + final_end = min(ei_last, limit); + + if (final_start >= final_end) + continue; + + addr = memblock_find_in_range(final_start, final_end, size, align); + + if (addr == MEMBLOCK_ERROR) + continue; + + return addr; + } + + return MEMBLOCK_ERROR; +} +#endif + int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { - unsigned long start_pfn, end_pfn; int i; + u64 start, end; /* need to go over early_node_map to find out good range for node */ - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) - nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } return nr_range; } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + int ret; + + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 
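work_with_active_regions() above hands each registered PFN range of a node to a caller-supplied callback and stops the walk on the first nonzero return. A hedged, standalone sketch of that contract (the region values and count_pages_fn are hypothetical; only the callback signature mirrors the work_fn_t usage above):

/* Standalone sketch of the work_with_active_regions() callback contract. */
#include <stdio.h>

typedef int (*work_fn_t)(unsigned long, unsigned long, void *);

struct region { unsigned long start_pfn, end_pfn; };

static struct region map[] = { { 0x000, 0x100 }, { 0x200, 0x300 } };

static void work_with_regions(work_fn_t work_fn, void *data)
{
        unsigned long i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
                if (work_fn(map[i].start_pfn, map[i].end_pfn, data))
                        break;  /* nonzero return ends the walk early */
}

static int count_pages_fn(unsigned long start_pfn, unsigned long end_pfn,
                          void *data)
{
        *(unsigned long *)data += end_pfn - start_pfn;
        return 0;       /* keep iterating */
}

int main(void)
{
        unsigned long total = 0;

        work_with_regions(count_pages_fn, &total);
        printf("total pages: %#lx\n", total);   /* 0x100 + 0x100 = 0x200 */
        return 0;
}

Note that find_memory_core_early() walks the same map through the _reverse variant of the iterator, so candidate ranges near the top of memory are tried first.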
@@ -3806,11 +3952,12 @@ int __init add_from_early_node_map(struct range *range, int az, */ void __init sparse_memory_present_with_active_regions(int nid) { - unsigned long start_pfn, end_pfn; - int i, this_nid; + int i; - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) - memory_present(this_nid, start_pfn, end_pfn); + for_each_active_range_index_in_nid(i, nid) + memory_present(early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); } /** @@ -3827,15 +3974,13 @@ void __init sparse_memory_present_with_active_regions(int nid) void __meminit get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { - unsigned long this_start_pfn, this_end_pfn; int i; - *start_pfn = -1UL; *end_pfn = 0; - for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { - *start_pfn = min(*start_pfn, this_start_pfn); - *end_pfn = max(*end_pfn, this_end_pfn); + for_each_active_range_index_in_nid(i, nid) { + *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); + *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); } if (*start_pfn == -1UL) @@ -3938,16 +4083,46 @@ unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { - unsigned long nr_absent = range_end_pfn - range_start_pfn; - unsigned long start_pfn, end_pfn; - int i; + int i = 0; + unsigned long prev_end_pfn = 0, hole_pages = 0; + unsigned long start_pfn; + + /* Find the end_pfn of the first active range of pfns in the node */ + i = first_active_region_index_in_nid(nid); + if (i == -1) + return 0; + + prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + + /* Account for ranges before physical memory on this node */ + if (early_node_map[i].start_pfn > range_start_pfn) + hole_pages = prev_end_pfn - range_start_pfn; + + /* Find all holes for the zone within the node */ + for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { + + /* No need to continue if prev_end_pfn is outside the zone */ + if (prev_end_pfn >= range_end_pfn) + break; + + /* Make sure the end of the zone is not within the hole */ + start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + prev_end_pfn = max(prev_end_pfn, range_start_pfn); - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); - end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); - nr_absent -= end_pfn - start_pfn; + /* Update the hole size count and move on */ + if (start_pfn > range_start_pfn) { + BUG_ON(prev_end_pfn > start_pfn); + hole_pages += start_pfn - prev_end_pfn; + } + prev_end_pfn = early_node_map[i].end_pfn; } - return nr_absent; + + /* Account for ranges past physical memory on this node */ + if (range_end_pfn > prev_end_pfn) + hole_pages += range_end_pfn - + max(range_start_pfn, prev_end_pfn); + + return hole_pages; } /** @@ -3968,14 +4143,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { - unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; - unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); - zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); - zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); + zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], + node_start_pfn); + zone_end_pfn = 
min(arch_zone_highest_possible_pfn[zone_type], + node_end_pfn); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, @@ -3983,7 +4158,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#else static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *zones_size) @@ -4001,7 +4176,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) @@ -4224,10 +4399,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ @@ -4252,7 +4427,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP #if MAX_NUMNODES > 1 /* @@ -4273,6 +4448,170 @@ static inline void setup_nr_node_ids(void) } #endif +/** + * add_active_range - Register a range of PFNs backed by physical memory + * @nid: The node ID the range resides on + * @start_pfn: The start PFN of the available physical memory + * @end_pfn: The end PFN of the available physical memory + * + * These ranges are stored in an early_node_map[] and later used by + * free_area_init_nodes() to calculate zone sizes and holes. If the + * range spans a memory hole, it is up to the architecture to ensure + * the memory is not freed by the bootmem allocator. If possible + * the range being registered will be merged with existing ranges. 
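+ * For example (hypothetical PFNs): registering (nid 0, 0x100, 0x200) and
+ * then (nid 0, 0x180, 0x300) leaves one merged entry covering [0x100, 0x300).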
+ */ +void __init add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + + mminit_dprintk(MMINIT_TRACE, "memory_register", + "Entering add_active_range(%d, %#lx, %#lx) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); + + /* Merge with existing active regions if possible */ + for (i = 0; i < nr_nodemap_entries; i++) { + if (early_node_map[i].nid != nid) + continue; + + /* Skip if an existing region covers this new one */ + if (start_pfn >= early_node_map[i].start_pfn && + end_pfn <= early_node_map[i].end_pfn) + return; + + /* Merge forward if suitable */ + if (start_pfn <= early_node_map[i].end_pfn && + end_pfn > early_node_map[i].end_pfn) { + early_node_map[i].end_pfn = end_pfn; + return; + } + + /* Merge backward if suitable */ + if (start_pfn < early_node_map[i].start_pfn && + end_pfn >= early_node_map[i].start_pfn) { + early_node_map[i].start_pfn = start_pfn; + return; + } + } + + /* Check that early_node_map is large enough */ + if (i >= MAX_ACTIVE_REGIONS) { + printk(KERN_CRIT "More than %d memory regions, truncating\n", + MAX_ACTIVE_REGIONS); + return; + } + + early_node_map[i].nid = nid; + early_node_map[i].start_pfn = start_pfn; + early_node_map[i].end_pfn = end_pfn; + nr_nodemap_entries = i + 1; +} + +/** + * remove_active_range - Shrink an existing registered range of PFNs + * @nid: The node id the range is on that should be shrunk + * @start_pfn: The new start PFN of the range + * @end_pfn: The new end PFN of the range + * + * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. + * The map is kept near the end of the physical page range that has already + * been registered. This function allows an arch to shrink an existing + * registered range. 
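+ * For example (hypothetical PFNs): removing (nid 0, 0x180, 0x200) from an
+ * existing [0x100, 0x200) entry trims it to [0x100, 0x180).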
+ */ +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i, j; + int removed = 0; + + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + + /* Find the old active region end and shrink */ + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { + /* clear it */ + early_node_map[i].start_pfn = 0; + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; + continue; + } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } +} + +/** + * remove_all_active_ranges - Remove all currently registered regions + * + * During discovery, it may be found that a table like SRAT is invalid + * and an alternative discovery method must be used. This function removes + * all currently registered regions. + */ +void __init remove_all_active_ranges(void) +{ + memset(early_node_map, 0, sizeof(early_node_map)); + nr_nodemap_entries = 0; +} + +/* Compare two active node_active_regions */ +static int __init cmp_node_active_region(const void *a, const void *b) +{ + struct node_active_region *arange = (struct node_active_region *)a; + struct node_active_region *brange = (struct node_active_region *)b; + + /* Done this way to avoid overflows */ + if (arange->start_pfn > brange->start_pfn) + return 1; + if (arange->start_pfn < brange->start_pfn) + return -1; + + return 0; +} + +/* sort the node_map by start_pfn */ +void __init sort_node_map(void) +{ + sort(early_node_map, (size_t)nr_nodemap_entries, + sizeof(struct node_active_region), + cmp_node_active_region, NULL); +} + /** * node_map_pfn_alignment - determine the maximum internode alignment * @@ -4295,11 +4634,15 @@ static inline void setup_nr_node_ids(void) unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; - unsigned long start, end, mask; int last_nid = -1; - int i, nid; + int i; + + for_each_active_range_index_in_nid(i, MAX_NUMNODES) { + int nid = early_node_map[i].nid; + unsigned long start = early_node_map[i].start_pfn; + unsigned long end = early_node_map[i].end_pfn; + unsigned long mask; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { if (!start || last_nid < 0 || last_nid == nid) { last_nid = nid; last_end = end; @@ -4326,12 +4669,12 @@ unsigned long __init node_map_pfn_alignment(void) /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; int i; + unsigned long min_pfn = ULONG_MAX; - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, 
NULL) - min_pfn = min(min_pfn, start_pfn); + /* Assuming a sorted map, the first range found has the starting pfn */ + for_each_active_range_index_in_nid(i, nid) + min_pfn = min(min_pfn, early_node_map[i].start_pfn); if (min_pfn == ULONG_MAX) { printk(KERN_WARNING @@ -4360,16 +4703,15 @@ unsigned long __init find_min_pfn_with_active_regions(void) */ static unsigned long __init early_calculate_totalpages(void) { + int i; unsigned long totalpages = 0; - unsigned long start_pfn, end_pfn; - int i, nid; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - unsigned long pages = end_pfn - start_pfn; + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long pages = early_node_map[i].end_pfn - + early_node_map[i].start_pfn; totalpages += pages; if (pages) - node_set_state(nid, N_HIGH_MEMORY); + node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); } return totalpages; } @@ -4424,8 +4766,6 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; for_each_node_state(nid, N_HIGH_MEMORY) { - unsigned long start_pfn, end_pfn; - /* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested @@ -4442,10 +4782,13 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) kernelcore_remaining = kernelcore_node; /* Go through each range of PFNs within this node */ - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + for_each_active_range_index_in_nid(i, nid) { + unsigned long start_pfn, end_pfn; unsigned long size_pages; - start_pfn = max(start_pfn, zone_movable_pfn[nid]); + start_pfn = max(early_node_map[i].start_pfn, + zone_movable_pfn[nid]); + end_pfn = early_node_map[i].end_pfn; if (start_pfn >= end_pfn) continue; @@ -4547,8 +4890,11 @@ static void check_for_regular_memory(pg_data_t *pgdat) */ void __init free_area_init_nodes(unsigned long *max_zone_pfn) { - unsigned long start_pfn, end_pfn; - int i, nid; + unsigned long nid; + int i; + + /* Sort early_node_map as initialisation assumes it is sorted */ + sort_node_map(); /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -4595,9 +4941,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) } /* Print out the early_node_map[] */ - printk("Early memory PFN ranges\n"); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); + printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); + for (i = 0; i < nr_nodemap_entries; i++) + printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); /* Initialise every node */ mminit_verify_pageflags_layout(); @@ -4650,7 +4998,7 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ /** * set_dma_reserve - set the specified number of pages reserved in the first zone diff --git a/trunk/mm/slub.c b/trunk/mm/slub.c index 09ccee8fb58e..ed3334d9b6da 100644 --- a/trunk/mm/slub.c +++ b/trunk/mm/slub.c @@ -368,7 +368,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page VM_BUG_ON(!irqs_disabled()); #ifdef CONFIG_CMPXCHG_DOUBLE if (s->flags & 
__CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, + if (cmpxchg_double(&page->freelist, freelist_old, counters_old, freelist_new, counters_new)) return 1; @@ -402,7 +402,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, { #ifdef CONFIG_CMPXCHG_DOUBLE if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, + if (cmpxchg_double(&page->freelist, freelist_old, counters_old, freelist_new, counters_new)) return 1; diff --git a/trunk/net/bluetooth/hci_conn.c b/trunk/net/bluetooth/hci_conn.c index c1c597e3e198..e0af7237cd92 100644 --- a/trunk/net/bluetooth/hci_conn.c +++ b/trunk/net/bluetooth/hci_conn.c @@ -673,7 +673,7 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) goto encrypt; auth: - if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) + if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) return 0; if (!hci_conn_auth(conn, sec_level, auth_type)) diff --git a/trunk/net/bluetooth/hci_core.c b/trunk/net/bluetooth/hci_core.c index b84458dcc226..be84ae33ae36 100644 --- a/trunk/net/bluetooth/hci_core.c +++ b/trunk/net/bluetooth/hci_core.c @@ -613,7 +613,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) if (!test_bit(HCI_RAW, &hdev->flags)) { set_bit(HCI_INIT, &hdev->flags); __hci_request(hdev, hci_reset_req, 0, - msecs_to_jiffies(250)); + msecs_to_jiffies(HCI_INIT_TIMEOUT)); clear_bit(HCI_INIT, &hdev->flags); } diff --git a/trunk/net/bluetooth/l2cap_core.c b/trunk/net/bluetooth/l2cap_core.c index 17b5b1cd9657..5ea94a1eecf2 100644 --- a/trunk/net/bluetooth/l2cap_core.c +++ b/trunk/net/bluetooth/l2cap_core.c @@ -2152,7 +2152,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, voi void *ptr = req->data; int type, olen; unsigned long val; - struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; + struct l2cap_conf_rfc rfc; BT_DBG("chan %p, rsp %p, len %d, req %p", chan, rsp, len, data); @@ -2271,16 +2271,6 @@ static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len) } } - /* Use sane default values in case a misbehaving remote device - * did not send an RFC option. 
- */ - rfc.mode = chan->mode; - rfc.retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO); - rfc.monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO); - rfc.max_pdu_size = cpu_to_le16(chan->imtu); - - BT_ERR("Expected RFC option was not found, using defaults"); - done: switch (rfc.mode) { case L2CAP_MODE_ERTM: diff --git a/trunk/net/bluetooth/rfcomm/core.c b/trunk/net/bluetooth/rfcomm/core.c index 2d28dfe98389..4e32e18211f9 100644 --- a/trunk/net/bluetooth/rfcomm/core.c +++ b/trunk/net/bluetooth/rfcomm/core.c @@ -1146,7 +1146,6 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) if (list_empty(&s->dlcs)) { s->state = BT_DISCONN; rfcomm_send_disc(s, 0); - rfcomm_session_clear_timer(s); } break; diff --git a/trunk/net/bridge/br_netfilter.c b/trunk/net/bridge/br_netfilter.c index fa8b8f763580..d6ec3720c77e 100644 --- a/trunk/net/bridge/br_netfilter.c +++ b/trunk/net/bridge/br_netfilter.c @@ -114,18 +114,12 @@ static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, const vo return NULL; } -static unsigned int fake_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - static struct dst_ops fake_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .update_pmtu = fake_update_pmtu, .cow_metrics = fake_cow_metrics, .neigh_lookup = fake_neigh_lookup, - .mtu = fake_mtu, }; /* @@ -147,7 +141,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) rt->dst.dev = br->dev; rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_NOPEER; + rt->dst.flags = DST_NOXFRM; rt->dst.ops = &fake_dst_ops; } diff --git a/trunk/net/core/flow.c b/trunk/net/core/flow.c index e318c7e98042..8ae42de9c79e 100644 --- a/trunk/net/core/flow.c +++ b/trunk/net/core/flow.c @@ -358,18 +358,6 @@ void flow_cache_flush(void) put_online_cpus(); } -static void flow_cache_flush_task(struct work_struct *work) -{ - flow_cache_flush(); -} - -static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); - -void flow_cache_flush_deferred(void) -{ - schedule_work(&flow_cache_flush_work); -} - static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); diff --git a/trunk/net/core/net-sysfs.c b/trunk/net/core/net-sysfs.c index 385aefe53648..c71c434a4c05 100644 --- a/trunk/net/core/net-sysfs.c +++ b/trunk/net/core/net-sysfs.c @@ -665,14 +665,11 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, if (count) { int i; - if (count > INT_MAX) - return -EINVAL; - count = roundup_pow_of_two(count); - if (count > (ULONG_MAX - sizeof(struct rps_dev_flow_table)) - / sizeof(struct rps_dev_flow)) { + if (count > 1<<30) { /* Enforce a limit to prevent overflow */ return -EINVAL; } + count = roundup_pow_of_two(count); table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); if (!table) return -ENOMEM; diff --git a/trunk/net/core/sock.c b/trunk/net/core/sock.c index b23f174ab84c..4ed7b1d12f5e 100644 --- a/trunk/net/core/sock.c +++ b/trunk/net/core/sock.c @@ -288,7 +288,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + /* Cast sk->rcvbuf to unsigned... 
It's pointless, but reduces + number of warnings when compiling with -W --ANK + */ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; diff --git a/trunk/net/ipv4/ipconfig.c b/trunk/net/ipv4/ipconfig.c index 99ec116bef14..0da2afc97f32 100644 --- a/trunk/net/ipv4/ipconfig.c +++ b/trunk/net/ipv4/ipconfig.c @@ -253,10 +253,6 @@ static int __init ic_open_devs(void) } } - /* no point in waiting if we could not bring up at least one device */ - if (!ic_first_dev) - goto have_carrier; - /* wait for a carrier on at least one device */ start = jiffies; while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { diff --git a/trunk/net/ipv4/route.c b/trunk/net/ipv4/route.c index 94cdbc55ca7e..46af62363b8c 100644 --- a/trunk/net/ipv4/route.c +++ b/trunk/net/ipv4/route.c @@ -91,7 +91,6 @@ #include #include #include -#include #include #include #include @@ -121,7 +120,6 @@ static int ip_rt_max_size; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; -static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; @@ -135,9 +133,6 @@ static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; static int redirect_genid; -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; - /* * Interface to generic destination cache. */ @@ -835,97 +830,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) return ONE; } -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - - /* Cleanup aged off entries. 
*/ - *rthp = rth->dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - /* * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensures we can call rt_cache_invalidate() @@ -1367,7 +1271,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) { struct rtable *rt = (struct rtable *) dst; - if (rt && !(rt->dst.flags & DST_NOPEER)) { + if (rt) { if (rt->peer == NULL) rt_bind_peer(rt, rt->rt_dst, 1); @@ -1378,7 +1282,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) iph->id = htons(inet_getid(rt->peer, more)); return; } - } else if (!rt) + } else printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", __builtin_return_address(0)); @@ -3274,13 +3178,6 @@ static ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "gc_interval", - .data = &ip_rt_gc_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, @@ -3491,11 +3388,6 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM diff --git a/trunk/net/ipv6/ip6_output.c b/trunk/net/ipv6/ip6_output.c index ec562713db9b..84d0bd5cac93 100644 --- a/trunk/net/ipv6/ip6_output.c +++ b/trunk/net/ipv6/ip6_output.c @@ -603,7 +603,7 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) static atomic_t ipv6_fragmentation_id; int old, new; - if (rt && !(rt->dst.flags & DST_NOPEER)) { + if (rt) { struct inet_peer *peer; if (!rt->rt6i_peer) diff --git a/trunk/net/llc/af_llc.c b/trunk/net/llc/af_llc.c index a18e6c3d36e3..dfd3a648a551 100644 --- a/trunk/net/llc/af_llc.c +++ b/trunk/net/llc/af_llc.c @@ -833,15 +833,15 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, copied += used; len -= used; - /* For non stream protocols we get one packet per recvmsg call */ - if (sk->sk_type != SOCK_STREAM) - goto copy_uaddr; - if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, 0); *seq = 0; } + /* For non stream protocols we get one packet per recvmsg call */ + if (sk->sk_type != SOCK_STREAM) + goto copy_uaddr; + /* Partial read */ if (used + offset < skb->len) continue; @@ -857,12 +857,6 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); - - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, 0); - *seq = 0; - } - goto out; } diff --git a/trunk/net/netfilter/ipvs/ip_vs_conn.c b/trunk/net/netfilter/ipvs/ip_vs_conn.c index 29fa5badde75..12571fb2881c 100644 --- a/trunk/net/netfilter/ipvs/ip_vs_conn.c +++ b/trunk/net/netfilter/ipvs/ip_vs_conn.c @@ -616,7 +616,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn 
*cp) if ((cp) && (!cp->dest)) { dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, - cp->protocol, cp->fwmark, cp->flags); + cp->protocol, cp->fwmark); ip_vs_bind_dest(cp, dest); return dest; } else diff --git a/trunk/net/netfilter/ipvs/ip_vs_ctl.c b/trunk/net/netfilter/ipvs/ip_vs_ctl.c index e1a66cf37f9a..008bf97cc91a 100644 --- a/trunk/net/netfilter/ipvs/ip_vs_ctl.c +++ b/trunk/net/netfilter/ipvs/ip_vs_ctl.c @@ -619,21 +619,15 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol, __u32 fwmark, - __u32 flags) + __be16 vport, __u16 protocol, __u32 fwmark) { struct ip_vs_dest *dest; struct ip_vs_service *svc; - __be16 port = dport; svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; - if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) - port = 0; - dest = ip_vs_lookup_dest(svc, daddr, port); - if (!dest) - dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); + dest = ip_vs_lookup_dest(svc, daddr, dport); if (dest) atomic_inc(&dest->refcnt); ip_vs_service_put(svc); diff --git a/trunk/net/netfilter/ipvs/ip_vs_sync.c b/trunk/net/netfilter/ipvs/ip_vs_sync.c index 2b6678c0ce14..3cdd479f9b5d 100644 --- a/trunk/net/netfilter/ipvs/ip_vs_sync.c +++ b/trunk/net/netfilter/ipvs/ip_vs_sync.c @@ -740,7 +740,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * but still handled. */ dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, - param->vport, protocol, fwmark, flags); + param->vport, protocol, fwmark); /* Set the appropriate activity flag */ if (protocol == IPPROTO_TCP) { diff --git a/trunk/net/netfilter/nf_conntrack_netlink.c b/trunk/net/netfilter/nf_conntrack_netlink.c index 257e77256c5c..ef21b221f036 100644 --- a/trunk/net/netfilter/nf_conntrack_netlink.c +++ b/trunk/net/netfilter/nf_conntrack_netlink.c @@ -135,7 +135,7 @@ ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) static inline int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) { - long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; + long timeout = (ct->timeout.expires - jiffies) / HZ; if (timeout < 0) timeout = 0; @@ -1358,15 +1358,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1641,7 +1638,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; - long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; + long timeout = (exp->timeout.expires - jiffies) / HZ; struct nf_conn_help *help; if (timeout < 0) @@ -1872,30 +1869,25 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - nf_ct_expect_put(exp); + if (skb2 == NULL) goto out; - } rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); - nf_ct_expect_put(exp); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, 
MSG_DONTWAIT); - if (err < 0) - goto out; + nf_ct_expect_put(exp); - return 0; + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); free: kfree_skb(skb2); out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + nf_ct_expect_put(exp); + return err; } static int diff --git a/trunk/net/netfilter/xt_connbytes.c b/trunk/net/netfilter/xt_connbytes.c index 9ddf1c3bfb39..5b138506690e 100644 --- a/trunk/net/netfilter/xt_connbytes.c +++ b/trunk/net/netfilter/xt_connbytes.c @@ -87,10 +87,10 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) break; } - if (sinfo->count.to >= sinfo->count.from) + if (sinfo->count.to) return what <= sinfo->count.to && what >= sinfo->count.from; - else /* inverted */ - return what < sinfo->count.to || what > sinfo->count.from; + else + return what >= sinfo->count.from; } static int connbytes_mt_check(const struct xt_mtchk_param *par) diff --git a/trunk/net/nfc/nci/core.c b/trunk/net/nfc/nci/core.c index ea66034499ce..3925c6578767 100644 --- a/trunk/net/nfc/nci/core.c +++ b/trunk/net/nfc/nci/core.c @@ -69,7 +69,7 @@ static int __nci_request(struct nci_dev *ndev, __u32 timeout) { int rc = 0; - long completion_rc; + unsigned long completion_rc; ndev->req_status = NCI_REQ_PEND; diff --git a/trunk/net/packet/af_packet.c b/trunk/net/packet/af_packet.c index d9d4970b9b07..82a6f34d39d0 100644 --- a/trunk/net/packet/af_packet.c +++ b/trunk/net/packet/af_packet.c @@ -1630,7 +1630,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, if (snaplen > res) snaplen = res; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { @@ -1761,7 +1762,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (po->copy_thresh && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { + atomic_read(&sk->sk_rmem_alloc) + skb->truesize + < (unsigned)sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { @@ -2448,12 +2450,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc { struct packet_sock *po = pkt_sk(sk); - if (po->fanout) { - if (dev) - dev_put(dev); - + if (po->fanout) return -EINVAL; - } lock_sock(sk); diff --git a/trunk/net/sched/sch_mqprio.c b/trunk/net/sched/sch_mqprio.c index 28de43092330..f88256cbacbf 100644 --- a/trunk/net/sched/sch_mqprio.c +++ b/trunk/net/sched/sch_mqprio.c @@ -107,7 +107,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; - if (!opt || nla_len(opt) < sizeof(*qopt)) + if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); diff --git a/trunk/net/sched/sch_netem.c b/trunk/net/sched/sch_netem.c index a4ab207cdc59..eb3b9a86c6ed 100644 --- a/trunk/net/sched/sch_netem.c +++ b/trunk/net/sched/sch_netem.c @@ -488,7 +488,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return -EINVAL; s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); + d = kmalloc(s, GFP_KERNEL); if (!d) d = vmalloc(s); if (!d) @@ -501,10 +501,9 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - swap(q->delay_dist, d); + dist_free(q->delay_dist); + q->delay_dist = d; 
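+	/* the old table has been freed and the new one published under the qdisc root lock */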
spin_unlock_bh(root_lock); - - dist_free(d); return 0; } diff --git a/trunk/net/sched/sch_qfq.c b/trunk/net/sched/sch_qfq.c index 7b0325459e71..103343408593 100644 --- a/trunk/net/sched/sch_qfq.c +++ b/trunk/net/sched/sch_qfq.c @@ -817,11 +817,11 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) { unsigned long mask; - u64 limit, roundedF; + uint32_t limit, roundedF; int slot_shift = cl->grp->slot_shift; roundedF = qfq_round_down(cl->F, slot_shift); - limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ diff --git a/trunk/net/sctp/associola.c b/trunk/net/sctp/associola.c index acd2edbc073e..152b5b3c3fff 100644 --- a/trunk/net/sctp/associola.c +++ b/trunk/net/sctp/associola.c @@ -173,7 +173,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = - min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ; + (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) diff --git a/trunk/net/sctp/output.c b/trunk/net/sctp/output.c index 817174eb5f41..08b3cead6503 100644 --- a/trunk/net/sctp/output.c +++ b/trunk/net/sctp/output.c @@ -697,7 +697,13 @@ static void sctp_packet_append_data(struct sctp_packet *packet, /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; - /* Update our view of the receiver's rwnd. */ + /* Update our view of the receiver's rwnd. Include sk_buff overhead + * while updating peer.rwnd so that it reduces the chances of a + * receiver running out of receive buffer space even when receive + * window is still open. This can happen when a sender is sending + * small messages. + */ + datasize += sizeof(struct sk_buff); if (datasize < rwnd) rwnd -= datasize; else diff --git a/trunk/net/sctp/outqueue.c b/trunk/net/sctp/outqueue.c index cfeb1d4a1ee6..14c2b06028ff 100644 --- a/trunk/net/sctp/outqueue.c +++ b/trunk/net/sctp/outqueue.c @@ -411,7 +411,8 @@ void sctp_retransmit_mark(struct sctp_outq *q, chunk->transport->flight_size -= sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); - q->asoc->peer.rwnd += sctp_data_size(chunk); + q->asoc->peer.rwnd += (sctp_data_size(chunk) + + sizeof(struct sk_buff)); } continue; } @@ -431,7 +432,8 @@ void sctp_retransmit_mark(struct sctp_outq *q, * (Section 7.2.4)), add the data size of those * chunks to the rwnd. */ - q->asoc->peer.rwnd += sctp_data_size(chunk); + q->asoc->peer.rwnd += (sctp_data_size(chunk) + + sizeof(struct sk_buff)); q->outstanding_bytes -= sctp_data_size(chunk); if (chunk->transport) transport->flight_size -= sctp_data_size(chunk); diff --git a/trunk/net/sctp/protocol.c b/trunk/net/sctp/protocol.c index 6f6ad8686833..61b9fca5a173 100644 --- a/trunk/net/sctp/protocol.c +++ b/trunk/net/sctp/protocol.c @@ -1285,9 +1285,6 @@ SCTP_STATIC __init int sctp_init(void) sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; - /* Initialize maximum autoclose timeout. */ - sctp_max_autoclose = INT_MAX / HZ; - /* Initialize handle used for association ids. 
*/ idr_init(&sctp_assocs_id); diff --git a/trunk/net/sctp/socket.c b/trunk/net/sctp/socket.c index 54a7cd2fdd7a..13bf5fcdbff1 100644 --- a/trunk/net/sctp/socket.c +++ b/trunk/net/sctp/socket.c @@ -2200,6 +2200,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, return -EINVAL; if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; + /* make sure it won't exceed MAX_SCHEDULE_TIMEOUT */ + sp->autoclose = min_t(long, sp->autoclose, MAX_SCHEDULE_TIMEOUT / HZ); return 0; } diff --git a/trunk/net/sctp/sysctl.c b/trunk/net/sctp/sysctl.c index 60ffbd067ff7..6b3952961b85 100644 --- a/trunk/net/sctp/sysctl.c +++ b/trunk/net/sctp/sysctl.c @@ -53,10 +53,6 @@ static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; -static unsigned long max_autoclose_min = 0; -static unsigned long max_autoclose_max = - (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) - ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; @@ -262,15 +258,6 @@ static ctl_table sctp_table[] = { .extra1 = &one, .extra2 = &rwnd_scale_max, }, - { - .procname = "max_autoclose", - .data = &sctp_max_autoclose, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .extra1 = &max_autoclose_min, - .extra2 = &max_autoclose_max, - }, { /* sentinel */ } }; diff --git a/trunk/net/socket.c b/trunk/net/socket.c index a0053750e37a..2877647f347b 100644 --- a/trunk/net/socket.c +++ b/trunk/net/socket.c @@ -2883,7 +2883,7 @@ static int bond_ioctl(struct net *net, unsigned int cmd, return dev_ioctl(net, cmd, uifr); default: - return -ENOIOCTLCMD; + return -EINVAL; } } @@ -3210,6 +3210,20 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return sock_do_ioctl(net, sock, cmd, arg); } + /* Prevent warning from compat_sys_ioctl, these always + * result in -EINVAL in the native case anyway. 
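+	 * Returning -EINVAL for them here matches the native result while
+	 * avoiding the unknown-ioctl warning the -ENOIOCTLCMD fallback below
+	 * would trigger.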
*/ + switch (cmd) { + case SIOCRTMSG: + case SIOCGIFCOUNT: + case SIOCSRARP: + case SIOCGRARP: + case SIOCDRARP: + case SIOCSIFLINK: + case SIOCGIFSLAVE: + case SIOCSIFSLAVE: + return -EINVAL; + } + return -ENOIOCTLCMD; } diff --git a/trunk/net/xfrm/xfrm_policy.c b/trunk/net/xfrm/xfrm_policy.c index 9049a5caeb25..2118d6446630 100644 --- a/trunk/net/xfrm/xfrm_policy.c +++ b/trunk/net/xfrm/xfrm_policy.c @@ -2276,6 +2276,8 @@ static void __xfrm_garbage_collect(struct net *net) { struct dst_entry *head, *next; + flow_cache_flush(); + spin_lock_bh(&xfrm_policy_sk_bundle_lock); head = xfrm_policy_sk_bundles; xfrm_policy_sk_bundles = NULL; @@ -2288,18 +2290,6 @@ static void __xfrm_garbage_collect(struct net *net) } } -static void xfrm_garbage_collect(struct net *net) -{ - flow_cache_flush(); - __xfrm_garbage_collect(net); -} - -static void xfrm_garbage_collect_deferred(struct net *net) -{ - flow_cache_flush_deferred(); - __xfrm_garbage_collect(net); -} - static void xfrm_init_pmtu(struct dst_entry *dst) { do { @@ -2432,7 +2422,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(afinfo->garbage_collect == NULL)) - afinfo->garbage_collect = xfrm_garbage_collect_deferred; + afinfo->garbage_collect = __xfrm_garbage_collect; xfrm_policy_afinfo[afinfo->family] = afinfo; } write_unlock_bh(&xfrm_policy_afinfo_lock); @@ -2526,7 +2516,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void switch (event) { case NETDEV_DOWN: - xfrm_garbage_collect(dev_net(dev)); + __xfrm_garbage_collect(dev_net(dev)); } return NOTIFY_DONE; } diff --git a/trunk/scripts/kconfig/Makefile b/trunk/scripts/kconfig/Makefile index 914833d99b06..ba573fe7c74d 100644 --- a/trunk/scripts/kconfig/Makefile +++ b/trunk/scripts/kconfig/Makefile @@ -60,8 +60,8 @@ update-po-config: $(obj)/kxgettext $(obj)/gconf.glade.h --directory=$(srctree) --directory=$(objtree) \ --output $(obj)/config.pot $(Q)sed -i s/CHARSET/UTF-8/ $(obj)/config.pot - $(Q)(for i in `ls $(srctree)/arch/*/Kconfig \ - $(srctree)/arch/*/um/Kconfig`; \ + $(Q)ln -fs Kconfig.x86 arch/um/Kconfig + $(Q)(for i in `ls $(srctree)/arch/*/Kconfig`; \ do \ echo " GEN $$i"; \ $(obj)/kxgettext $$i \ @@ -69,6 +69,7 @@ update-po-config: $(obj)/kxgettext $(obj)/gconf.glade.h done ) $(Q)msguniq --sort-by-file --to-code=UTF-8 $(obj)/config.pot \ --output $(obj)/linux.pot + $(Q)rm -f $(srctree)/arch/um/Kconfig $(Q)rm -f $(obj)/config.pot PHONY += allnoconfig allyesconfig allmodconfig alldefconfig randconfig diff --git a/trunk/security/security.c b/trunk/security/security.c index e2f684aeb70c..0c6cc69c8f86 100644 --- a/trunk/security/security.c +++ b/trunk/security/security.c @@ -381,7 +381,7 @@ int security_old_inode_init_security(struct inode *inode, struct inode *dir, void **value, size_t *len) { if (unlikely(IS_PRIVATE(inode))) - return -EOPNOTSUPP; + return 0; return security_ops->inode_init_security(inode, dir, qstr, name, value, len); } diff --git a/trunk/sound/atmel/ac97c.c b/trunk/sound/atmel/ac97c.c index 73516f69ac7c..6e5addeb236b 100644 --- a/trunk/sound/atmel/ac97c.c +++ b/trunk/sound/atmel/ac97c.c @@ -899,10 +899,6 @@ static void atmel_ac97c_reset(struct atmel_ac97c *chip) /* AC97 v2.2 specifications says minimum 1 us. 
*/ udelay(2); gpio_set_value(chip->reset_pin, 1); - } else { - ac97c_writel(chip, MR, AC97C_MR_WRST | AC97C_MR_ENA); - udelay(2); - ac97c_writel(chip, MR, AC97C_MR_ENA); } } diff --git a/trunk/sound/soc/codecs/wm8776.c b/trunk/sound/soc/codecs/wm8776.c index d3b0a20744f1..bfdc52370ad0 100644 --- a/trunk/sound/soc/codecs/wm8776.c +++ b/trunk/sound/soc/codecs/wm8776.c @@ -235,7 +235,6 @@ static int wm8776_hw_params(struct snd_pcm_substream *substream, switch (snd_pcm_format_width(params_format(params))) { case 16: iface = 0; - break; case 20: iface = 0x10; break; diff --git a/trunk/tools/perf/Documentation/perf-annotate.txt b/trunk/tools/perf/Documentation/perf-annotate.txt index c89f9e1453f7..fe6762ed56bd 100644 --- a/trunk/tools/perf/Documentation/perf-annotate.txt +++ b/trunk/tools/perf/Documentation/perf-annotate.txt @@ -22,7 +22,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. (default: perf.data) -d:: --dsos=:: @@ -66,7 +66,7 @@ OPTIONS used. This interfaces starts by centering on the line with more samples, TAB/UNTAB cycles through the lines with more samples. --C:: +-c:: --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can be provided as a comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to report samples on all diff --git a/trunk/tools/perf/Documentation/perf-buildid-list.txt b/trunk/tools/perf/Documentation/perf-buildid-list.txt index 25c52efcc7f0..cc22325ffd1b 100644 --- a/trunk/tools/perf/Documentation/perf-buildid-list.txt +++ b/trunk/tools/perf/Documentation/perf-buildid-list.txt @@ -26,7 +26,7 @@ OPTIONS Show only DSOs with hits. -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. (default: perf.data) -f:: --force:: Don't do ownership validation. diff --git a/trunk/tools/perf/Documentation/perf-evlist.txt b/trunk/tools/perf/Documentation/perf-evlist.txt index 0507ec7bad71..0cada9e053dc 100644 --- a/trunk/tools/perf/Documentation/perf-evlist.txt +++ b/trunk/tools/perf/Documentation/perf-evlist.txt @@ -18,7 +18,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. (default: perf.data) SEE ALSO -------- diff --git a/trunk/tools/perf/Documentation/perf-kmem.txt b/trunk/tools/perf/Documentation/perf-kmem.txt index 7c8fbbf3f61c..a52fcde894c7 100644 --- a/trunk/tools/perf/Documentation/perf-kmem.txt +++ b/trunk/tools/perf/Documentation/perf-kmem.txt @@ -23,7 +23,7 @@ OPTIONS ------- -i :: --input=:: - Select the input file (default: perf.data unless stdin is a fifo) + Select the input file (default: perf.data) --caller:: Show per-callsite statistics diff --git a/trunk/tools/perf/Documentation/perf-lock.txt b/trunk/tools/perf/Documentation/perf-lock.txt index d6b2a4f2108b..4a26a2f3a6a3 100644 --- a/trunk/tools/perf/Documentation/perf-lock.txt +++ b/trunk/tools/perf/Documentation/perf-lock.txt @@ -29,7 +29,7 @@ COMMON OPTIONS -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. -v:: --verbose:: diff --git a/trunk/tools/perf/Documentation/perf-record.txt b/trunk/tools/perf/Documentation/perf-record.txt index 2937f7e14bb7..5a520f825295 100644 --- a/trunk/tools/perf/Documentation/perf-record.txt +++ b/trunk/tools/perf/Documentation/perf-record.txt @@ -89,7 +89,7 @@ OPTIONS -m:: --mmap-pages=:: - Number of mmap data pages. Must be a power of two. + Number of mmap data pages. 
-g:: --call-graph:: diff --git a/trunk/tools/perf/Documentation/perf-report.txt b/trunk/tools/perf/Documentation/perf-report.txt index 9b430e98712e..212f24d672e1 100644 --- a/trunk/tools/perf/Documentation/perf-report.txt +++ b/trunk/tools/perf/Documentation/perf-report.txt @@ -19,7 +19,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. (default: perf.data) -v:: --verbose:: @@ -39,7 +39,7 @@ OPTIONS -T:: --threads:: Show per-thread event counters --c:: +-C:: --comms=:: Only consider symbols in these comms. CSV that understands file://filename entries. @@ -80,10 +80,9 @@ OPTIONS --dump-raw-trace:: Dump raw trace in ASCII. --g [type,min[,limit],order]:: +-g [type,min,order]:: --call-graph:: - Display call chains using type, min percent threshold, optional print - limit and order. + Display call chains using type, min percent threshold and order. type can be either: - flat: single column, linear exposure of call chains. - graph: use a graph tree, displaying absolute overhead rates. @@ -129,7 +128,7 @@ OPTIONS --symfs=:: Look for files with symbols relative to this directory. --C:: +-c:: --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can be provided as a comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to report samples on all diff --git a/trunk/tools/perf/Documentation/perf-sched.txt b/trunk/tools/perf/Documentation/perf-sched.txt index 8ff4df956951..5b212b57f70b 100644 --- a/trunk/tools/perf/Documentation/perf-sched.txt +++ b/trunk/tools/perf/Documentation/perf-sched.txt @@ -40,7 +40,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. (default: perf.data) -v:: --verbose:: diff --git a/trunk/tools/perf/Documentation/perf-script.txt b/trunk/tools/perf/Documentation/perf-script.txt index 2f6cef43da25..dec87ecb530e 100644 --- a/trunk/tools/perf/Documentation/perf-script.txt +++ b/trunk/tools/perf/Documentation/perf-script.txt @@ -106,7 +106,7 @@ OPTIONS -i:: --input=:: - Input file name. (default: perf.data unless stdin is a fifo) + Input file name. -d:: --debug-mode:: @@ -182,17 +182,12 @@ OPTIONS --hide-call-graph:: When printing symbols do not display call chain. --C:: +-c:: --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can be provided as a comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to report samples on all CPUs. --c:: ---comms=:: - Only display events for these comms. CSV that understands - file://filename entries. - -I:: --show-info:: Display extended information about the perf.data file. This adds diff --git a/trunk/tools/perf/Documentation/perf-test.txt b/trunk/tools/perf/Documentation/perf-test.txt index b24ac40fcd58..2c3b462f64b0 100644 --- a/trunk/tools/perf/Documentation/perf-test.txt +++ b/trunk/tools/perf/Documentation/perf-test.txt @@ -8,19 +8,13 @@ perf-test - Runs sanity tests. SYNOPSIS -------- [verse] -'perf test [] [{list |[|]}]' +'perf test ' DESCRIPTION ----------- This command does assorted sanity tests, initially through linked routines but also will look for a directory with more tests in the form of scripts. -To get a list of available tests use 'perf test list', specifying a test name -fragment will show all tests that have it. - -To run just specific tests, inform test name fragments or the numbers obtained -from 'perf test list'. 
- OPTIONS ------- -v:: diff --git a/trunk/tools/perf/Documentation/perf-timechart.txt b/trunk/tools/perf/Documentation/perf-timechart.txt index 1632b0efc757..d7b79e2ba2ad 100644 --- a/trunk/tools/perf/Documentation/perf-timechart.txt +++ b/trunk/tools/perf/Documentation/perf-timechart.txt @@ -27,7 +27,7 @@ OPTIONS Select the output file (default: output.svg) -i:: --input=:: - Select the input file (default: perf.data unless stdin is a fifo) + Select the input file (default: perf.data) -w:: --width=:: Select the width of the SVG file (default: 1000) diff --git a/trunk/tools/perf/Makefile b/trunk/tools/perf/Makefile index ac86d67b636e..b98e3075646b 100644 --- a/trunk/tools/perf/Makefile +++ b/trunk/tools/perf/Makefile @@ -278,7 +278,6 @@ LIB_H += util/strbuf.h LIB_H += util/strlist.h LIB_H += util/strfilter.h LIB_H += util/svghelper.h -LIB_H += util/tool.h LIB_H += util/run-command.h LIB_H += util/sigchain.h LIB_H += util/symbol.h diff --git a/trunk/tools/perf/builtin-annotate.c b/trunk/tools/perf/builtin-annotate.c index 214ba7f9f577..46b4c24f338e 100644 --- a/trunk/tools/perf/builtin-annotate.c +++ b/trunk/tools/perf/builtin-annotate.c @@ -27,32 +27,32 @@ #include "util/sort.h" #include "util/hist.h" #include "util/session.h" -#include "util/tool.h" #include -struct perf_annotate { - struct perf_tool tool; - char const *input_name; - bool force, use_tui, use_stdio; - bool full_paths; - bool print_line; - const char *sym_hist_filter; - const char *cpu_list; - DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); -}; +static char const *input_name = "perf.data"; + +static bool force, use_tui, use_stdio; + +static bool full_paths; + +static bool print_line; -static int perf_evsel__add_sample(struct perf_evsel *evsel, - struct perf_sample *sample, - struct addr_location *al, - struct perf_annotate *ann) +static const char *sym_hist_filter; + +static const char *cpu_list; +static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); + +static int perf_evlist__add_sample(struct perf_evlist *evlist, + struct perf_sample *sample, + struct perf_evsel *evsel, + struct addr_location *al) { struct hist_entry *he; int ret; - if (ann->sym_hist_filter != NULL && - (al->sym == NULL || - strcmp(ann->sym_hist_filter, al->sym->name) != 0)) { + if (sym_hist_filter != NULL && + (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) { /* We're only interested in a symbol named sym_hist_filter */ if (al->sym != NULL) { rb_erase(&al->sym->rb_node, @@ -69,7 +69,8 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel, ret = 0; if (he->ms.sym != NULL) { struct annotation *notes = symbol__annotation(he->ms.sym); - if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0) + if (notes->src == NULL && + symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0) return -ENOMEM; ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); @@ -80,26 +81,25 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel, return ret; } -static int process_sample_event(struct perf_tool *tool, - union perf_event *event, +static int process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, - struct machine *machine) + struct perf_session *session) { - struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool); struct addr_location al; - if (perf_event__preprocess_sample(event, machine, &al, sample, + if (perf_event__preprocess_sample(event, session, &al, sample, symbol__annotate_init) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return 
-1; } - if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap)) + if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) return 0; - if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) { + if (!al.filtered && + perf_evlist__add_sample(session->evlist, sample, evsel, &al)) { pr_warning("problem incrementing symbol count, " "skipping event\n"); return -1; @@ -108,15 +108,14 @@ static int process_sample_event(struct perf_tool *tool, return 0; } -static int hist_entry__tty_annotate(struct hist_entry *he, int evidx, - struct perf_annotate *ann) +static int hist_entry__tty_annotate(struct hist_entry *he, int evidx) { return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx, - ann->print_line, ann->full_paths, 0, 0); + print_line, full_paths, 0, 0); } static void hists__find_annotations(struct hists *self, int evidx, - struct perf_annotate *ann) + int nr_events) { struct rb_node *nd = rb_first(&self->entries), *next; int key = K_RIGHT; @@ -139,7 +138,8 @@ static void hists__find_annotations(struct hists *self, int evidx, } if (use_browser > 0) { - key = hist_entry__tui_annotate(he, evidx, NULL, NULL, 0); + key = hist_entry__tui_annotate(he, evidx, nr_events, + NULL, NULL, 0); switch (key) { case K_RIGHT: next = rb_next(nd); @@ -154,7 +154,7 @@ static void hists__find_annotations(struct hists *self, int evidx, if (next != NULL) nd = next; } else { - hist_entry__tty_annotate(he, evidx, ann); + hist_entry__tty_annotate(he, evidx); nd = rb_next(nd); /* * Since we have a hist_entry per IP for the same @@ -167,26 +167,33 @@ static void hists__find_annotations(struct hists *self, int evidx, } } -static int __cmd_annotate(struct perf_annotate *ann) +static struct perf_event_ops event_ops = { + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .fork = perf_event__process_task, + .ordered_samples = true, + .ordering_requires_timestamps = true, +}; + +static int __cmd_annotate(void) { int ret; struct perf_session *session; struct perf_evsel *pos; u64 total_nr_samples; - session = perf_session__new(ann->input_name, O_RDONLY, - ann->force, false, &ann->tool); + session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); if (session == NULL) return -ENOMEM; - if (ann->cpu_list) { - ret = perf_session__cpu_bitmap(session, ann->cpu_list, - ann->cpu_bitmap); + if (cpu_list) { + ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); if (ret) goto out_delete; } - ret = perf_session__process_events(session, &ann->tool); + ret = perf_session__process_events(session, &event_ops); if (ret) goto out_delete; @@ -210,12 +217,13 @@ static int __cmd_annotate(struct perf_annotate *ann) total_nr_samples += nr_samples; hists__collapse_resort(hists); hists__output_resort(hists); - hists__find_annotations(hists, pos->idx, ann); + hists__find_annotations(hists, pos->idx, + session->evlist->nr_entries); } } if (total_nr_samples == 0) { - ui__warning("The %s file has no samples!\n", session->filename); + ui__warning("The %s file has no samples!\n", input_name); goto out_delete; } out_delete: @@ -239,41 +247,29 @@ static const char * const annotate_usage[] = { NULL }; -int cmd_annotate(int argc, const char **argv, const char *prefix __used) -{ - struct perf_annotate annotate = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .comm = perf_event__process_comm, - .fork = perf_event__process_task, - .ordered_samples = true, - .ordering_requires_timestamps = true, - }, - }; - const struct option 
options[] = { - OPT_STRING('i', "input", &annotate.input_name, "file", +static const struct option options[] = { + OPT_STRING('i', "input", &input_name, "file", "input file name"), OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", "only consider symbols in these dsos"), - OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol", + OPT_STRING('s', "symbol", &sym_hist_filter, "symbol", "symbol to annotate"), - OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), - OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"), - OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"), + OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), - OPT_BOOLEAN('l', "print-line", &annotate.print_line, + OPT_BOOLEAN('l', "print-line", &print_line, "print matching source lines (may be slow)"), - OPT_BOOLEAN('P', "full-paths", &annotate.full_paths, + OPT_BOOLEAN('P', "full-paths", &full_paths, "Don't shorten the displayed pathnames"), - OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"), + OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src, @@ -283,13 +279,15 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. 
-M intel for intel syntax)"), OPT_END() - }; +}; +int cmd_annotate(int argc, const char **argv, const char *prefix __used) +{ argc = parse_options(argc, argv, options, annotate_usage, 0); - if (annotate.use_stdio) + if (use_stdio) use_browser = 0; - else if (annotate.use_tui) + else if (use_tui) use_browser = 1; setup_browser(true); @@ -310,7 +308,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) if (argc > 1) usage_with_options(annotate_usage, options); - annotate.sym_hist_filter = argv[0]; + sym_hist_filter = argv[0]; } if (field_sep && *field_sep == '.') { @@ -318,5 +316,5 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) return -1; } - return __cmd_annotate(&annotate); + return __cmd_annotate(); } diff --git a/trunk/tools/perf/builtin-buildid-list.c b/trunk/tools/perf/builtin-buildid-list.c index 52480467e9ff..cb690a65bf02 100644 --- a/trunk/tools/perf/builtin-buildid-list.c +++ b/trunk/tools/perf/builtin-buildid-list.c @@ -18,7 +18,7 @@ #include -static const char *input_name; +static char const *input_name = "perf.data"; static bool force; static bool show_kernel; static bool with_hits; @@ -39,6 +39,24 @@ static const struct option options[] = { OPT_END() }; +static int perf_session__list_build_ids(void) +{ + struct perf_session *session; + + session = perf_session__new(input_name, O_RDONLY, force, false, + &build_id__mark_dso_hit_ops); + if (session == NULL) + return -1; + + if (with_hits) + perf_session__process_events(session, &build_id__mark_dso_hit_ops); + + perf_session__fprintf_dsos_buildid(session, stdout, with_hits); + + perf_session__delete(session); + return 0; +} + static int sysfs__fprintf_build_id(FILE *fp) { u8 kallsyms_build_id[BUILD_ID_SIZE]; @@ -67,37 +85,18 @@ static int filename__fprintf_build_id(const char *name, FILE *fp) return fprintf(fp, "%s\n", sbuild_id); } -static int perf_session__list_build_ids(void) -{ - struct perf_session *session; - - elf_version(EV_CURRENT); - - session = perf_session__new(input_name, O_RDONLY, force, false, - &build_id__mark_dso_hit_ops); - if (session == NULL) - return -1; - - /* - * See if this is an ELF file first: - */ - if (filename__fprintf_build_id(session->filename, stdout)) - goto out; - - if (with_hits) - perf_session__process_events(session, &build_id__mark_dso_hit_ops); - - perf_session__fprintf_dsos_buildid(session, stdout, with_hits); -out: - perf_session__delete(session); - return 0; -} - static int __cmd_buildid_list(void) { if (show_kernel) return sysfs__fprintf_build_id(stdout); + elf_version(EV_CURRENT); + /* + * See if this is an ELF file first: + */ + if (filename__fprintf_build_id(input_name, stdout)) + return 0; + return perf_session__list_build_ids(); } diff --git a/trunk/tools/perf/builtin-diff.c b/trunk/tools/perf/builtin-diff.c index 4f19513d7dda..b39f3a1ee7dc 100644 --- a/trunk/tools/perf/builtin-diff.c +++ b/trunk/tools/perf/builtin-diff.c @@ -9,9 +9,7 @@ #include "util/debug.h" #include "util/event.h" #include "util/hist.h" -#include "util/evsel.h" #include "util/session.h" -#include "util/tool.h" #include "util/sort.h" #include "util/symbol.h" #include "util/util.h" @@ -32,15 +30,14 @@ static int hists__add_entry(struct hists *self, return -ENOMEM; } -static int diff__process_sample_event(struct perf_tool *tool __used, - union perf_event *event, +static int diff__process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct machine *machine) + struct perf_session *session) { struct addr_location 
al; - if (perf_event__preprocess_sample(event, machine, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -49,16 +46,16 @@ static int diff__process_sample_event(struct perf_tool *tool __used, if (al.filtered || al.sym == NULL) return 0; - if (hists__add_entry(&evsel->hists, &al, sample->period)) { + if (hists__add_entry(&session->hists, &al, sample->period)) { pr_warning("problem incrementing symbol period, skipping event\n"); return -1; } - evsel->hists.stats.total_period += sample->period; + session->hists.stats.total_period += sample->period; return 0; } -static struct perf_tool perf_diff = { +static struct perf_event_ops event_ops = { .sample = diff__process_sample_event, .mmap = perf_event__process_mmap, .comm = perf_event__process_comm, @@ -148,13 +145,13 @@ static int __cmd_diff(void) int ret, i; struct perf_session *session[2]; - session[0] = perf_session__new(input_old, O_RDONLY, force, false, &perf_diff); - session[1] = perf_session__new(input_new, O_RDONLY, force, false, &perf_diff); + session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops); + session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops); if (session[0] == NULL || session[1] == NULL) return -ENOMEM; for (i = 0; i < 2; ++i) { - ret = perf_session__process_events(session[i], &perf_diff); + ret = perf_session__process_events(session[i], &event_ops); if (ret) goto out_delete; } diff --git a/trunk/tools/perf/builtin-evlist.c b/trunk/tools/perf/builtin-evlist.c index 26760322c4f4..4c5e9e04a41f 100644 --- a/trunk/tools/perf/builtin-evlist.c +++ b/trunk/tools/perf/builtin-evlist.c @@ -15,7 +15,7 @@ #include "util/parse-options.h" #include "util/session.h" -static const char *input_name; +static char const *input_name = "perf.data"; static int __cmd_evlist(void) { diff --git a/trunk/tools/perf/builtin-inject.c b/trunk/tools/perf/builtin-inject.c index 09c106193e65..8dfc12bb119b 100644 --- a/trunk/tools/perf/builtin-inject.c +++ b/trunk/tools/perf/builtin-inject.c @@ -9,7 +9,6 @@ #include "perf.h" #include "util/session.h" -#include "util/tool.h" #include "util/debug.h" #include "util/parse-options.h" @@ -17,9 +16,8 @@ static char const *input_name = "-"; static bool inject_build_ids; -static int perf_event__repipe_synth(struct perf_tool *tool __used, - union perf_event *event, - struct machine *machine __used) +static int perf_event__repipe_synth(union perf_event *event, + struct perf_session *session __used) { uint32_t size; void *buf = event; @@ -38,70 +36,41 @@ static int perf_event__repipe_synth(struct perf_tool *tool __used, return 0; } -static int perf_event__repipe_op2_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __used) -{ - return perf_event__repipe_synth(tool, event, NULL); -} - -static int perf_event__repipe_event_type_synth(struct perf_tool *tool, - union perf_event *event) -{ - return perf_event__repipe_synth(tool, event, NULL); -} - -static int perf_event__repipe_tracing_data_synth(union perf_event *event, - struct perf_session *session __used) -{ - return perf_event__repipe_synth(NULL, event, NULL); -} - -static int perf_event__repipe_attr(union perf_event *event, - struct perf_evlist **pevlist __used) -{ - return perf_event__repipe_synth(NULL, event, NULL); -} - -static int perf_event__repipe(struct perf_tool *tool, - union perf_event *event, +static int perf_event__repipe(union 
perf_event *event, struct perf_sample *sample __used, - struct machine *machine) + struct perf_session *session) { - return perf_event__repipe_synth(tool, event, machine); + return perf_event__repipe_synth(event, session); } -static int perf_event__repipe_sample(struct perf_tool *tool, - union perf_event *event, +static int perf_event__repipe_sample(union perf_event *event, struct perf_sample *sample __used, struct perf_evsel *evsel __used, - struct machine *machine) + struct perf_session *session) { - return perf_event__repipe_synth(tool, event, machine); + return perf_event__repipe_synth(event, session); } -static int perf_event__repipe_mmap(struct perf_tool *tool, - union perf_event *event, +static int perf_event__repipe_mmap(union perf_event *event, struct perf_sample *sample, - struct machine *machine) + struct perf_session *session) { int err; - err = perf_event__process_mmap(tool, event, sample, machine); - perf_event__repipe(tool, event, sample, machine); + err = perf_event__process_mmap(event, sample, session); + perf_event__repipe(event, sample, session); return err; } -static int perf_event__repipe_task(struct perf_tool *tool, - union perf_event *event, +static int perf_event__repipe_task(union perf_event *event, struct perf_sample *sample, - struct machine *machine) + struct perf_session *session) { int err; - err = perf_event__process_task(tool, event, sample, machine); - perf_event__repipe(tool, event, sample, machine); + err = perf_event__process_task(event, sample, session); + perf_event__repipe(event, sample, session); return err; } @@ -111,7 +80,7 @@ static int perf_event__repipe_tracing_data(union perf_event *event, { int err; - perf_event__repipe_synth(NULL, event, NULL); + perf_event__repipe_synth(event, session); err = perf_event__process_tracing_data(event, session); return err; @@ -131,10 +100,10 @@ static int dso__read_build_id(struct dso *self) return -1; } -static int dso__inject_build_id(struct dso *self, struct perf_tool *tool, - struct machine *machine) +static int dso__inject_build_id(struct dso *self, struct perf_session *session) { u16 misc = PERF_RECORD_MISC_USER; + struct machine *machine; int err; if (dso__read_build_id(self) < 0) { @@ -142,11 +111,17 @@ static int dso__inject_build_id(struct dso *self, struct perf_tool *tool, return -1; } + machine = perf_session__find_host_machine(session); + if (machine == NULL) { + pr_err("Can't find machine for session\n"); + return -1; + } + if (self->kernel) misc = PERF_RECORD_MISC_KERNEL; - err = perf_event__synthesize_build_id(tool, self, misc, perf_event__repipe, - machine); + err = perf_event__synthesize_build_id(self, misc, perf_event__repipe, + machine, session); if (err) { pr_err("Can't synthesize build_id event for %s\n", self->long_name); return -1; @@ -155,11 +130,10 @@ static int dso__inject_build_id(struct dso *self, struct perf_tool *tool, return 0; } -static int perf_event__inject_buildid(struct perf_tool *tool, - union perf_event *event, +static int perf_event__inject_buildid(union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct machine *machine) + struct perf_session *session) { struct addr_location al; struct thread *thread; @@ -167,21 +141,21 @@ static int perf_event__inject_buildid(struct perf_tool *tool, cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - thread = machine__findnew_thread(machine, event->ip.pid); + thread = perf_session__findnew(session, event->ip.pid); if (thread == NULL) { pr_err("problem processing %d event, skipping 
it.\n", event->header.type); goto repipe; } - thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION, - event->ip.ip, &al); + thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, + event->ip.pid, event->ip.ip, &al); if (al.map != NULL) { if (!al.map->dso->hit) { al.map->dso->hit = 1; if (map__load(al.map, NULL) >= 0) { - dso__inject_build_id(al.map->dso, tool, machine); + dso__inject_build_id(al.map->dso, session); /* * If this fails, too bad, let the other side * account this as unresolved. @@ -194,24 +168,24 @@ static int perf_event__inject_buildid(struct perf_tool *tool, } repipe: - perf_event__repipe(tool, event, sample, machine); + perf_event__repipe(event, sample, session); return 0; } -struct perf_tool perf_inject = { +struct perf_event_ops inject_ops = { .sample = perf_event__repipe_sample, .mmap = perf_event__repipe, .comm = perf_event__repipe, .fork = perf_event__repipe, .exit = perf_event__repipe, .lost = perf_event__repipe, - .read = perf_event__repipe_sample, + .read = perf_event__repipe, .throttle = perf_event__repipe, .unthrottle = perf_event__repipe, - .attr = perf_event__repipe_attr, - .event_type = perf_event__repipe_event_type_synth, - .tracing_data = perf_event__repipe_tracing_data_synth, - .build_id = perf_event__repipe_op2_synth, + .attr = perf_event__repipe_synth, + .event_type = perf_event__repipe_synth, + .tracing_data = perf_event__repipe_synth, + .build_id = perf_event__repipe_synth, }; extern volatile int session_done; @@ -229,17 +203,17 @@ static int __cmd_inject(void) signal(SIGINT, sig_handler); if (inject_build_ids) { - perf_inject.sample = perf_event__inject_buildid; - perf_inject.mmap = perf_event__repipe_mmap; - perf_inject.fork = perf_event__repipe_task; - perf_inject.tracing_data = perf_event__repipe_tracing_data; + inject_ops.sample = perf_event__inject_buildid; + inject_ops.mmap = perf_event__repipe_mmap; + inject_ops.fork = perf_event__repipe_task; + inject_ops.tracing_data = perf_event__repipe_tracing_data; } - session = perf_session__new(input_name, O_RDONLY, false, true, &perf_inject); + session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops); if (session == NULL) return -ENOMEM; - ret = perf_session__process_events(session, &perf_inject); + ret = perf_session__process_events(session, &inject_ops); perf_session__delete(session); diff --git a/trunk/tools/perf/builtin-kmem.c b/trunk/tools/perf/builtin-kmem.c index fe1ad8f21961..225e963df105 100644 --- a/trunk/tools/perf/builtin-kmem.c +++ b/trunk/tools/perf/builtin-kmem.c @@ -7,7 +7,6 @@ #include "util/thread.h" #include "util/header.h" #include "util/session.h" -#include "util/tool.h" #include "util/parse-options.h" #include "util/trace-event.h" @@ -19,7 +18,7 @@ struct alloc_stat; typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *); -static const char *input_name; +static char const *input_name = "perf.data"; static int alloc_flag; static int caller_flag; @@ -304,13 +303,12 @@ static void process_raw_event(union perf_event *raw_event __used, void *data, } } -static int process_sample_event(struct perf_tool *tool __used, - union perf_event *event, +static int process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct machine *machine) + struct perf_session *session) { - struct thread *thread = machine__findnew_thread(machine, event->ip.pid); + struct thread *thread = perf_session__findnew(session, event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", 
@@ -326,7 +324,7 @@ static int process_sample_event(struct perf_tool *tool __used, return 0; } -static struct perf_tool perf_kmem = { +static struct perf_event_ops event_ops = { .sample = process_sample_event, .comm = perf_event__process_comm, .ordered_samples = true, @@ -485,7 +483,7 @@ static int __cmd_kmem(void) { int err = -EINVAL; struct perf_session *session = perf_session__new(input_name, O_RDONLY, - 0, false, &perf_kmem); + 0, false, &event_ops); if (session == NULL) return -ENOMEM; @@ -496,7 +494,7 @@ static int __cmd_kmem(void) goto out_delete; setup_pager(); - err = perf_session__process_events(session, &perf_kmem); + err = perf_session__process_events(session, &event_ops); if (err != 0) goto out_delete; sort_result(); diff --git a/trunk/tools/perf/builtin-kvm.c b/trunk/tools/perf/builtin-kvm.c index 032324a76b87..34d1e853829d 100644 --- a/trunk/tools/perf/builtin-kvm.c +++ b/trunk/tools/perf/builtin-kvm.c @@ -38,7 +38,7 @@ static const struct option kvm_options[] = { OPT_BOOLEAN(0, "guest", &perf_guest, "Collect guest os data"), OPT_BOOLEAN(0, "host", &perf_host, - "Collect host os data"), + "Collect guest os data"), OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory", "guest mount directory under which every guest os" " instance has a subdir"), diff --git a/trunk/tools/perf/builtin-lock.c b/trunk/tools/perf/builtin-lock.c index 2296c391d0f5..899080ace267 100644 --- a/trunk/tools/perf/builtin-lock.c +++ b/trunk/tools/perf/builtin-lock.c @@ -12,7 +12,6 @@ #include "util/debug.h" #include "util/session.h" -#include "util/tool.h" #include #include @@ -326,7 +325,7 @@ static struct lock_stat *lock_stat_findnew(void *addr, const char *name) die("memory allocation failed\n"); } -static const char *input_name; +static char const *input_name = "perf.data"; struct raw_event_sample { u32 size; @@ -846,13 +845,12 @@ static void dump_info(void) die("Unknown type of information\n"); } -static int process_sample_event(struct perf_tool *tool __used, - union perf_event *event, +static int process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct machine *machine) + struct perf_session *s) { - struct thread *thread = machine__findnew_thread(machine, sample->tid); + struct thread *thread = perf_session__findnew(s, sample->tid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", @@ -865,7 +863,7 @@ static int process_sample_event(struct perf_tool *tool __used, return 0; } -static struct perf_tool eops = { +static struct perf_event_ops eops = { .sample = process_sample_event, .comm = perf_event__process_comm, .ordered_samples = true, diff --git a/trunk/tools/perf/builtin-probe.c b/trunk/tools/perf/builtin-probe.c index 59d43abfbfec..710ae3d0a489 100644 --- a/trunk/tools/perf/builtin-probe.c +++ b/trunk/tools/perf/builtin-probe.c @@ -46,6 +46,7 @@ #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*" #define DEFAULT_FUNC_FILTER "!_*" +#define MAX_PATH_LEN 256 /* Session management structure */ static struct { diff --git a/trunk/tools/perf/builtin-record.c b/trunk/tools/perf/builtin-record.c index 0abfb18b911f..6ab58cc99d53 100644 --- a/trunk/tools/perf/builtin-record.c +++ b/trunk/tools/perf/builtin-record.c @@ -22,7 +22,6 @@ #include "util/evsel.h" #include "util/debug.h" #include "util/session.h" -#include "util/tool.h" #include "util/symbol.h" #include "util/cpumap.h" #include "util/thread_map.h" @@ -36,36 +35,55 @@ enum write_mode_t { WRITE_APPEND }; -struct perf_record { - struct perf_tool tool; - 
struct perf_record_opts opts; - u64 bytes_written; - const char *output_name; - struct perf_evlist *evlist; - struct perf_session *session; - const char *progname; - int output; - unsigned int page_size; - int realtime_prio; - enum write_mode_t write_mode; - bool no_buildid; - bool no_buildid_cache; - bool force; - bool file_new; - bool append_file; - long samples; - off_t post_processing_offset; -}; - -static void advance_output(struct perf_record *rec, size_t size) +static u64 user_interval = ULLONG_MAX; +static u64 default_interval = 0; + +static unsigned int page_size; +static unsigned int mmap_pages = UINT_MAX; +static unsigned int user_freq = UINT_MAX; +static int freq = 1000; +static int output; +static int pipe_output = 0; +static const char *output_name = NULL; +static bool group = false; +static int realtime_prio = 0; +static bool nodelay = false; +static bool raw_samples = false; +static bool sample_id_all_avail = true; +static bool system_wide = false; +static pid_t target_pid = -1; +static pid_t target_tid = -1; +static pid_t child_pid = -1; +static bool no_inherit = false; +static enum write_mode_t write_mode = WRITE_FORCE; +static bool call_graph = false; +static bool inherit_stat = false; +static bool no_samples = false; +static bool sample_address = false; +static bool sample_time = false; +static bool no_buildid = false; +static bool no_buildid_cache = false; +static struct perf_evlist *evsel_list; + +static long samples = 0; +static u64 bytes_written = 0; + +static int file_new = 1; +static off_t post_processing_offset; + +static struct perf_session *session; +static const char *cpu_list; +static const char *progname; + +static void advance_output(size_t size) { - rec->bytes_written += size; + bytes_written += size; } -static void write_output(struct perf_record *rec, void *buf, size_t size) +static void write_output(void *buf, size_t size) { while (size) { - int ret = write(rec->output, buf, size); + int ret = write(output, buf, size); if (ret < 0) die("failed to write"); @@ -73,33 +91,30 @@ static void write_output(struct perf_record *rec, void *buf, size_t size) size -= ret; buf += ret; - rec->bytes_written += ret; + bytes_written += ret; } } -static int process_synthesized_event(struct perf_tool *tool, - union perf_event *event, +static int process_synthesized_event(union perf_event *event, struct perf_sample *sample __used, - struct machine *machine __used) + struct perf_session *self __used) { - struct perf_record *rec = container_of(tool, struct perf_record, tool); - write_output(rec, event, event->header.size); + write_output(event, event->header.size); return 0; } -static void perf_record__mmap_read(struct perf_record *rec, - struct perf_mmap *md) +static void mmap_read(struct perf_mmap *md) { unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; - unsigned char *data = md->base + rec->page_size; + unsigned char *data = md->base + page_size; unsigned long size; void *buf; if (old == head) return; - rec->samples++; + samples++; size = head - old; @@ -108,14 +123,14 @@ static void perf_record__mmap_read(struct perf_record *rec, size = md->mask + 1 - (old & md->mask); old += size; - write_output(rec, buf, size); + write_output(buf, size); } buf = &data[old & md->mask]; size = head - old; old += size; - write_output(rec, buf, size); + write_output(buf, size); md->prev = old; perf_mmap__write_tail(md, old); @@ -134,18 +149,17 @@ static void sig_handler(int sig) signr = sig; } -static void perf_record__sig_exit(int exit_status __used, void *arg) 
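The mmap_read() above has to cope with [old, head) wrapping past the end of the power-of-two mmap buffer, so it writes the data out in at most two chunks. A standalone sketch of that wrap-around copy, with write_output() reduced to an fwrite() stand-in:

#include <stdio.h>

#define BUF_SIZE 8	/* power of two, so the index mask is size - 1 */

static void write_output(const void *buf, size_t size)
{
	fwrite(buf, 1, size, stdout);
}

/* Copy everything between 'old' and 'head'; if the region crosses
 * the end of the buffer, emit the tail piece first, then the part
 * that wrapped around to the front, mirroring mmap_read(). */
static void ring_read(const unsigned char *data, unsigned long mask,
		      unsigned long old, unsigned long head)
{
	unsigned long size = head - old;

	if ((old & mask) + size != (head & mask)) {
		unsigned long chunk = mask + 1 - (old & mask);

		write_output(&data[old & mask], chunk);
		old += chunk;
		size = head - old;
	}
	write_output(&data[old & mask], size);
}

int main(void)
{
	/* Buffer logically holding "ABCDEFGH", stored wrapped. */
	unsigned char data[BUF_SIZE] = { 'E', 'F', 'G', 'H',
					 'A', 'B', 'C', 'D' };

	/* old = 4, head = 12: the 8 bytes cross the buffer end. */
	ring_read(data, BUF_SIZE - 1, 4, 12);
	putchar('\n');
	return 0;
}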
+static void sig_atexit(void) { - struct perf_record *rec = arg; int status; - if (rec->evlist->workload.pid > 0) { + if (child_pid > 0) { if (!child_finished) - kill(rec->evlist->workload.pid, SIGTERM); + kill(child_pid, SIGTERM); wait(&status); if (WIFSIGNALED(status)) - psignal(WTERMSIG(status), rec->progname); + psignal(WTERMSIG(status), progname); } if (signr == -1 || signr == SIGUSR1) @@ -155,6 +169,78 @@ static void perf_record__sig_exit(int exit_status __used, void *arg) kill(getpid(), signr); } +static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist) +{ + struct perf_event_attr *attr = &evsel->attr; + int track = !evsel->idx; /* only the first counter needs these */ + + attr->disabled = 1; + attr->inherit = !no_inherit; + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING | + PERF_FORMAT_ID; + + attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; + + if (evlist->nr_entries > 1) + attr->sample_type |= PERF_SAMPLE_ID; + + /* + * We default some events to a 1 default interval. But keep + * it a weak assumption overridable by the user. + */ + if (!attr->sample_period || (user_freq != UINT_MAX && + user_interval != ULLONG_MAX)) { + if (freq) { + attr->sample_type |= PERF_SAMPLE_PERIOD; + attr->freq = 1; + attr->sample_freq = freq; + } else { + attr->sample_period = default_interval; + } + } + + if (no_samples) + attr->sample_freq = 0; + + if (inherit_stat) + attr->inherit_stat = 1; + + if (sample_address) { + attr->sample_type |= PERF_SAMPLE_ADDR; + attr->mmap_data = track; + } + + if (call_graph) + attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + + if (system_wide) + attr->sample_type |= PERF_SAMPLE_CPU; + + if (sample_id_all_avail && + (sample_time || system_wide || !no_inherit || cpu_list)) + attr->sample_type |= PERF_SAMPLE_TIME; + + if (raw_samples) { + attr->sample_type |= PERF_SAMPLE_TIME; + attr->sample_type |= PERF_SAMPLE_RAW; + attr->sample_type |= PERF_SAMPLE_CPU; + } + + if (nodelay) { + attr->watermark = 0; + attr->wakeup_events = 1; + } + + attr->mmap = track; + attr->comm = track; + + if (target_pid == -1 && target_tid == -1 && !system_wide) { + attr->disabled = 1; + attr->enable_on_exec = 1; + } +} + static bool perf_evlist__equal(struct perf_evlist *evlist, struct perf_evlist *other) { @@ -174,16 +260,14 @@ static bool perf_evlist__equal(struct perf_evlist *evlist, return true; } -static void perf_record__open(struct perf_record *rec) +static void open_counters(struct perf_evlist *evlist) { struct perf_evsel *pos, *first; - struct perf_evlist *evlist = rec->evlist; - struct perf_session *session = rec->session; - struct perf_record_opts *opts = &rec->opts; - first = list_entry(evlist->entries.next, struct perf_evsel, node); + if (evlist->cpus->map[0] < 0) + no_inherit = true; - perf_evlist__config_attrs(evlist, opts); + first = list_entry(evlist->entries.next, struct perf_evsel, node); list_for_each_entry(pos, &evlist->entries, node) { struct perf_event_attr *attr = &pos->attr; @@ -202,27 +286,29 @@ static void perf_record__open(struct perf_record *rec) */ bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; - if (opts->group && pos != first) + if (group && pos != first) group_fd = first->fd; + + config_attr(pos, evlist); retry_sample_id: - attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0; + attr->sample_id_all = sample_id_all_avail ? 
1 : 0; try_again: - if (perf_evsel__open(pos, evlist->cpus, evlist->threads, - opts->group, group_fd) < 0) { + if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group, + group_fd) < 0) { int err = errno; if (err == EPERM || err == EACCES) { ui__error_paranoid(); exit(EXIT_FAILURE); - } else if (err == ENODEV && opts->cpu_list) { + } else if (err == ENODEV && cpu_list) { die("No such device - did you specify" " an out-of-range profile CPU?\n"); - } else if (err == EINVAL && opts->sample_id_all_avail) { + } else if (err == EINVAL && sample_id_all_avail) { /* * Old kernel, no attr->sample_id_type_all field */ - opts->sample_id_all_avail = false; - if (!opts->sample_time && !opts->raw_samples && !time_needed) + sample_id_all_avail = false; + if (!sample_time && !raw_samples && !time_needed) attr->sample_type &= ~PERF_SAMPLE_TIME; goto retry_sample_id; @@ -272,20 +358,10 @@ static void perf_record__open(struct perf_record *rec) exit(-1); } - if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) { - if (errno == EPERM) - die("Permission error mapping pages.\n" - "Consider increasing " - "/proc/sys/kernel/perf_event_mlock_kb,\n" - "or try again with a smaller value of -m/--mmap_pages.\n" - "(current value: %d)\n", opts->mmap_pages); - else if (!is_power_of_2(opts->mmap_pages)) - die("--mmap_pages/-m value must be a power of two."); - + if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); - } - if (rec->file_new) + if (file_new) session->evlist = evlist; else { if (!perf_evlist__equal(session->evlist, evlist)) { @@ -297,32 +373,29 @@ static void perf_record__open(struct perf_record *rec) perf_session__update_sample_type(session); } -static int process_buildids(struct perf_record *rec) +static int process_buildids(void) { - u64 size = lseek(rec->output, 0, SEEK_CUR); + u64 size = lseek(output, 0, SEEK_CUR); if (size == 0) return 0; - rec->session->fd = rec->output; - return __perf_session__process_events(rec->session, rec->post_processing_offset, - size - rec->post_processing_offset, + session->fd = output; + return __perf_session__process_events(session, post_processing_offset, + size - post_processing_offset, size, &build_id__mark_dso_hit_ops); } -static void perf_record__exit(int status __used, void *arg) +static void atexit_header(void) { - struct perf_record *rec = arg; - - if (!rec->opts.pipe_output) { - rec->session->header.data_size += rec->bytes_written; - - if (!rec->no_buildid) - process_buildids(rec); - perf_session__write_header(rec->session, rec->evlist, - rec->output, true); - perf_session__delete(rec->session); - perf_evlist__delete(rec->evlist); + if (!pipe_output) { + session->header.data_size += bytes_written; + + if (!no_buildid) + process_buildids(); + perf_session__write_header(session, evsel_list, output, true); + perf_session__delete(session); + perf_evlist__delete(evsel_list); symbol__exit(); } } @@ -330,7 +403,7 @@ static void perf_record__exit(int status __used, void *arg) static void perf_event__synthesize_guest_os(struct machine *machine, void *data) { int err; - struct perf_tool *tool = data; + struct perf_session *psession = data; if (machine__is_host(machine)) return; @@ -343,8 +416,8 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data) *method is used to avoid symbol missing when the first addr is *in module instead of in guest kernel. 
*/ - err = perf_event__synthesize_modules(tool, process_synthesized_event, - machine); + err = perf_event__synthesize_modules(process_synthesized_event, + psession, machine); if (err < 0) pr_err("Couldn't record guest kernel [%d]'s reference" " relocation symbol.\n", machine->pid); @@ -353,11 +426,12 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data) * We use _stext for guest kernel because guest kernel's /proc/kallsyms * have no _text sometimes. */ - err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, - machine, "_text"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + psession, machine, "_text"); if (err < 0) - err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, - machine, "_stext"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + psession, machine, + "_stext"); if (err < 0) pr_err("Couldn't record guest kernel [%d]'s reference" " relocation symbol.\n", machine->pid); @@ -368,71 +442,73 @@ static struct perf_event_header finished_round_event = { .type = PERF_RECORD_FINISHED_ROUND, }; -static void perf_record__mmap_read_all(struct perf_record *rec) +static void mmap_read_all(void) { int i; - for (i = 0; i < rec->evlist->nr_mmaps; i++) { - if (rec->evlist->mmap[i].base) - perf_record__mmap_read(rec, &rec->evlist->mmap[i]); + for (i = 0; i < evsel_list->nr_mmaps; i++) { + if (evsel_list->mmap[i].base) + mmap_read(&evsel_list->mmap[i]); } - if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO)) - write_output(rec, &finished_round_event, sizeof(finished_round_event)); + if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO)) + write_output(&finished_round_event, sizeof(finished_round_event)); } -static int __cmd_record(struct perf_record *rec, int argc, const char **argv) +static int __cmd_record(int argc, const char **argv) { struct stat st; int flags; - int err, output; + int err; unsigned long waking = 0; + int child_ready_pipe[2], go_pipe[2]; const bool forks = argc > 0; + char buf; struct machine *machine; - struct perf_tool *tool = &rec->tool; - struct perf_record_opts *opts = &rec->opts; - struct perf_evlist *evsel_list = rec->evlist; - const char *output_name = rec->output_name; - struct perf_session *session; - rec->progname = argv[0]; + progname = argv[0]; - rec->page_size = sysconf(_SC_PAGE_SIZE); + page_size = sysconf(_SC_PAGE_SIZE); - on_exit(perf_record__sig_exit, rec); + atexit(sig_atexit); signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); signal(SIGUSR1, sig_handler); + if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) { + perror("failed to create pipes"); + exit(-1); + } + if (!output_name) { if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode)) - opts->pipe_output = true; + pipe_output = 1; else - rec->output_name = output_name = "perf.data"; + output_name = "perf.data"; } if (output_name) { if (!strcmp(output_name, "-")) - opts->pipe_output = true; + pipe_output = 1; else if (!stat(output_name, &st) && st.st_size) { - if (rec->write_mode == WRITE_FORCE) { + if (write_mode == WRITE_FORCE) { char oldname[PATH_MAX]; snprintf(oldname, sizeof(oldname), "%s.old", output_name); unlink(oldname); rename(output_name, oldname); } - } else if (rec->write_mode == WRITE_APPEND) { - rec->write_mode = WRITE_FORCE; + } else if (write_mode == WRITE_APPEND) { + write_mode = WRITE_FORCE; } } flags = O_CREAT|O_RDWR; - if (rec->write_mode == WRITE_APPEND) - rec->file_new = 0; + if (write_mode == WRITE_APPEND) + file_new = 0; 
else flags |= O_TRUNC; - if (opts->pipe_output) + if (pipe_output) output = STDOUT_FILENO; else output = open(output_name, flags, S_IRUSR | S_IWUSR); @@ -441,21 +517,17 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) exit(-1); } - rec->output = output; - session = perf_session__new(output_name, O_WRONLY, - rec->write_mode == WRITE_FORCE, false, NULL); + write_mode == WRITE_FORCE, false, NULL); if (session == NULL) { pr_err("Not enough memory for reading perf file header\n"); return -1; } - rec->session = session; - - if (!rec->no_buildid) + if (!no_buildid) perf_header__set_feat(&session->header, HEADER_BUILD_ID); - if (!rec->file_new) { + if (!file_new) { err = perf_session__read_header(session, output); if (err < 0) goto out_delete_session; @@ -477,57 +549,94 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY); perf_header__set_feat(&session->header, HEADER_CPUID); + /* 512 kiB: default amount of unprivileged mlocked memory */ + if (mmap_pages == UINT_MAX) + mmap_pages = (512 * 1024) / page_size; + if (forks) { - err = perf_evlist__prepare_workload(evsel_list, opts, argv); - if (err < 0) { - pr_err("Couldn't run the workload!\n"); - goto out_delete_session; + child_pid = fork(); + if (child_pid < 0) { + perror("failed to fork"); + exit(-1); + } + + if (!child_pid) { + if (pipe_output) + dup2(2, 1); + close(child_ready_pipe[0]); + close(go_pipe[1]); + fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); + + /* + * Do a dummy execvp to get the PLT entry resolved, + * so we avoid the resolver overhead on the real + * execvp call. + */ + execvp("", (char **)argv); + + /* + * Tell the parent we're ready to go + */ + close(child_ready_pipe[1]); + + /* + * Wait until the parent tells us to go. + */ + if (read(go_pipe[0], &buf, 1) == -1) + perror("unable to read pipe"); + + execvp(argv[0], (char **)argv); + + perror(argv[0]); + kill(getppid(), SIGUSR1); + exit(-1); } + + if (!system_wide && target_tid == -1 && target_pid == -1) + evsel_list->threads->map[0] = child_pid; + + close(child_ready_pipe[1]); + close(go_pipe[0]); + /* + * wait for child to settle + */ + if (read(child_ready_pipe[0], &buf, 1) == -1) { + perror("unable to read pipe"); + exit(-1); + } + close(child_ready_pipe[0]); } - perf_record__open(rec); + open_counters(evsel_list); /* - * perf_session__delete(session) will be called at perf_record__exit() + * perf_session__delete(session) will be called at atexit_header() */ - on_exit(perf_record__exit, rec); + atexit(atexit_header); - if (opts->pipe_output) { + if (pipe_output) { err = perf_header__write_pipe(output); if (err < 0) return err; - } else if (rec->file_new) { + } else if (file_new) { err = perf_session__write_header(session, evsel_list, output, false); if (err < 0) return err; } - if (!!rec->no_buildid - && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { - pr_err("Couldn't generating buildids. 
" - "Use --no-buildid to profile anyway.\n"); - return -1; - } + post_processing_offset = lseek(output, 0, SEEK_CUR); - rec->post_processing_offset = lseek(output, 0, SEEK_CUR); - - machine = perf_session__find_host_machine(session); - if (!machine) { - pr_err("Couldn't find native kernel information.\n"); - return -1; - } - - if (opts->pipe_output) { - err = perf_event__synthesize_attrs(tool, session, - process_synthesized_event); + if (pipe_output) { + err = perf_session__synthesize_attrs(session, + process_synthesized_event); if (err < 0) { pr_err("Couldn't synthesize attrs.\n"); return err; } - err = perf_event__synthesize_event_types(tool, process_synthesized_event, - machine); + err = perf_event__synthesize_event_types(process_synthesized_event, + session); if (err < 0) { pr_err("Couldn't synthesize event_types.\n"); return err; @@ -542,49 +651,56 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) * return this more properly and also * propagate errors that now are calling die() */ - err = perf_event__synthesize_tracing_data(tool, output, evsel_list, - process_synthesized_event); + err = perf_event__synthesize_tracing_data(output, evsel_list, + process_synthesized_event, + session); if (err <= 0) { pr_err("Couldn't record tracing data.\n"); return err; } - advance_output(rec, err); + advance_output(err); } } - err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, - machine, "_text"); + machine = perf_session__find_host_machine(session); + if (!machine) { + pr_err("Couldn't find native kernel information.\n"); + return -1; + } + + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + session, machine, "_text"); if (err < 0) - err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, - machine, "_stext"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + session, machine, "_stext"); if (err < 0) pr_err("Couldn't record kernel reference relocation symbol\n" "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" "Check /proc/kallsyms permission or run as root.\n"); - err = perf_event__synthesize_modules(tool, process_synthesized_event, - machine); + err = perf_event__synthesize_modules(process_synthesized_event, + session, machine); if (err < 0) pr_err("Couldn't record kernel module information.\n" "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" "Check /proc/modules permission or run as root.\n"); if (perf_guest) - perf_session__process_machines(session, tool, + perf_session__process_machines(session, perf_event__synthesize_guest_os); - if (!opts->system_wide) - perf_event__synthesize_thread_map(tool, evsel_list->threads, + if (!system_wide) + perf_event__synthesize_thread_map(evsel_list->threads, process_synthesized_event, - machine); + session); else - perf_event__synthesize_threads(tool, process_synthesized_event, - machine); + perf_event__synthesize_threads(process_synthesized_event, + session); - if (rec->realtime_prio) { + if (realtime_prio) { struct sched_param param; - param.sched_priority = rec->realtime_prio; + param.sched_priority = realtime_prio; if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { pr_err("Could not set realtime priority.\n"); exit(-1); @@ -597,14 +713,14 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) * Let the child rip */ if (forks) - perf_evlist__start_workload(evsel_list); + close(go_pipe[1]); for (;;) { - int hits = rec->samples; + int hits = samples; - perf_record__mmap_read_all(rec); + mmap_read_all(); - if (hits == rec->samples) { + if (hits == samples) { if (done) break; err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1); @@ -625,9 +741,9 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) */ fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n", - (double)rec->bytes_written / 1024.0 / 1024.0, + (double)bytes_written / 1024.0 / 1024.0, output_name, - rec->bytes_written / 24); + bytes_written / 24); return 0; @@ -642,89 +758,58 @@ static const char * const record_usage[] = { NULL }; -/* - * XXX Ideally would be local to cmd_record() and passed to a perf_record__new - * because we need to have access to it in perf_record__exit, that is called - * after cmd_record() exits, but since record_options need to be accessible to - * builtin-script, leave it here. - * - * At least we don't ouch it in all the other functions here directly. - * - * Just say no to tons of global variables, sigh. - */ -static struct perf_record record = { - .opts = { - .target_pid = -1, - .target_tid = -1, - .mmap_pages = UINT_MAX, - .user_freq = UINT_MAX, - .user_interval = ULLONG_MAX, - .freq = 1000, - .sample_id_all_avail = true, - }, - .write_mode = WRITE_FORCE, - .file_new = true, -}; +static bool force, append_file; -/* - * XXX Will stay a global variable till we fix builtin-script.c to stop messing - * with it and switch to use the library functions in perf_evlist that came - * from builtin-record.c, i.e. use perf_record_opts, - * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record', - * using pipes, etc. - */ const struct option record_options[] = { - OPT_CALLBACK('e', "event", &record.evlist, "event", + OPT_CALLBACK('e', "event", &evsel_list, "event", "event selector. 
use 'perf list' to list available events", parse_events_option), - OPT_CALLBACK(0, "filter", &record.evlist, "filter", + OPT_CALLBACK(0, "filter", &evsel_list, "filter", "event filter", parse_filter), - OPT_INTEGER('p', "pid", &record.opts.target_pid, + OPT_INTEGER('p', "pid", &target_pid, "record events on existing process id"), - OPT_INTEGER('t', "tid", &record.opts.target_tid, + OPT_INTEGER('t', "tid", &target_tid, "record events on existing thread id"), - OPT_INTEGER('r', "realtime", &record.realtime_prio, + OPT_INTEGER('r', "realtime", &realtime_prio, "collect data with this RT SCHED_FIFO priority"), - OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay, + OPT_BOOLEAN('D', "no-delay", &nodelay, "collect data without buffering"), - OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples, + OPT_BOOLEAN('R', "raw-samples", &raw_samples, "collect raw sample records from all opened counters"), - OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide, + OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), - OPT_BOOLEAN('A', "append", &record.append_file, + OPT_BOOLEAN('A', "append", &append_file, "append to the output file to do incremental profiling"), - OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu", + OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to monitor"), - OPT_BOOLEAN('f', "force", &record.force, + OPT_BOOLEAN('f', "force", &force, "overwrite existing data file (deprecated)"), - OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), - OPT_STRING('o', "output", &record.output_name, "file", + OPT_U64('c', "count", &user_interval, "event period to sample"), + OPT_STRING('o', "output", &output_name, "file", "output file name"), - OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit, + OPT_BOOLEAN('i', "no-inherit", &no_inherit, "child tasks do not inherit counters"), - OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"), - OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages, - "number of mmap data pages"), - OPT_BOOLEAN(0, "group", &record.opts.group, + OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"), + OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), + OPT_BOOLEAN(0, "group", &group, "put the counters into a counter group"), - OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph, + OPT_BOOLEAN('g', "call-graph", &call_graph, "do call-graph (stack chain/backtrace) recording"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"), - OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, + OPT_BOOLEAN('s', "stat", &inherit_stat, "per thread counts"), - OPT_BOOLEAN('d', "data", &record.opts.sample_address, + OPT_BOOLEAN('d', "data", &sample_address, "Sample addresses"), - OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"), - OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"), - OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples, + OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"), + OPT_BOOLEAN('n', "no-samples", &no_samples, "don't sample"), - OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache, + OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache, "do not update the buildid cache"), - OPT_BOOLEAN('B', "no-buildid", &record.no_buildid, + OPT_BOOLEAN('B', "no-buildid", &no_buildid, "do not collect buildids in perf.data"), - OPT_CALLBACK('G', "cgroup", &record.evlist, "name", + 
OPT_CALLBACK('G', "cgroup", &evsel_list, "name", "monitor event in cgroup name only", parse_cgroups), OPT_END() @@ -734,8 +819,6 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) { int err = -ENOMEM; struct perf_evsel *pos; - struct perf_evlist *evsel_list; - struct perf_record *rec = &record; perf_header__set_cmdline(argc, argv); @@ -743,25 +826,23 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) if (evsel_list == NULL) return -ENOMEM; - rec->evlist = evsel_list; - argc = parse_options(argc, argv, record_options, record_usage, PARSE_OPT_STOP_AT_NON_OPTION); - if (!argc && rec->opts.target_pid == -1 && rec->opts.target_tid == -1 && - !rec->opts.system_wide && !rec->opts.cpu_list) + if (!argc && target_pid == -1 && target_tid == -1 && + !system_wide && !cpu_list) usage_with_options(record_usage, record_options); - if (rec->force && rec->append_file) { + if (force && append_file) { fprintf(stderr, "Can't overwrite and append at the same time." " You need to choose between -f and -A"); usage_with_options(record_usage, record_options); - } else if (rec->append_file) { - rec->write_mode = WRITE_APPEND; + } else if (append_file) { + write_mode = WRITE_APPEND; } else { - rec->write_mode = WRITE_FORCE; + write_mode = WRITE_FORCE; } - if (nr_cgroups && !rec->opts.system_wide) { + if (nr_cgroups && !system_wide) { fprintf(stderr, "cgroup monitoring only available in" " system-wide mode\n"); usage_with_options(record_usage, record_options); @@ -779,7 +860,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) "If some relocation was applied (e.g. kexec) symbols may be misresolved\n" "even with a suitable vmlinux or kallsyms file.\n\n"); - if (rec->no_buildid_cache || rec->no_buildid) + if (no_buildid_cache || no_buildid) disable_buildid_cache(); if (evsel_list->nr_entries == 0 && @@ -788,37 +869,43 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) goto out_symbol_exit; } - if (rec->opts.target_pid != -1) - rec->opts.target_tid = rec->opts.target_pid; + if (target_pid != -1) + target_tid = target_pid; - if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid, - rec->opts.target_tid, rec->opts.cpu_list) < 0) + if (perf_evlist__create_maps(evsel_list, target_pid, + target_tid, cpu_list) < 0) usage_with_options(record_usage, record_options); list_for_each_entry(pos, &evsel_list->entries, node) { + if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, + evsel_list->threads->nr) < 0) + goto out_free_fd; if (perf_header__push_event(pos->attr.config, event_name(pos))) goto out_free_fd; } - if (rec->opts.user_interval != ULLONG_MAX) - rec->opts.default_interval = rec->opts.user_interval; - if (rec->opts.user_freq != UINT_MAX) - rec->opts.freq = rec->opts.user_freq; + if (perf_evlist__alloc_pollfd(evsel_list) < 0) + goto out_free_fd; + + if (user_interval != ULLONG_MAX) + default_interval = user_interval; + if (user_freq != UINT_MAX) + freq = user_freq; /* * User specified count overrides default frequency. 
*/ - if (rec->opts.default_interval) - rec->opts.freq = 0; - else if (rec->opts.freq) { - rec->opts.default_interval = rec->opts.freq; + if (default_interval) + freq = 0; + else if (freq) { + default_interval = freq; } else { fprintf(stderr, "frequency and count are zero, aborting\n"); err = -EINVAL; goto out_free_fd; } - err = __cmd_record(&record, argc, argv); + err = __cmd_record(argc, argv); out_free_fd: perf_evlist__delete_maps(evsel_list); out_symbol_exit: diff --git a/trunk/tools/perf/builtin-report.c b/trunk/tools/perf/builtin-report.c index 25d34d483e49..4d7c8340c326 100644 --- a/trunk/tools/perf/builtin-report.c +++ b/trunk/tools/perf/builtin-report.c @@ -25,7 +25,6 @@ #include "util/evsel.h" #include "util/header.h" #include "util/session.h" -#include "util/tool.h" #include "util/parse-options.h" #include "util/parse-events.h" @@ -36,35 +35,38 @@ #include -struct perf_report { - struct perf_tool tool; - struct perf_session *session; - char const *input_name; - bool force, use_tui, use_stdio; - bool hide_unresolved; - bool dont_use_callchains; - bool show_full_info; - bool show_threads; - bool inverted_callchain; - struct perf_read_values show_threads_values; - const char *pretty_printing_style; - symbol_filter_t annotate_init; - const char *cpu_list; - DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); -}; +static char const *input_name = "perf.data"; + +static bool force, use_tui, use_stdio; +static bool hide_unresolved; +static bool dont_use_callchains; +static bool show_full_info; + +static bool show_threads; +static struct perf_read_values show_threads_values; + +static const char default_pretty_printing_style[] = "normal"; +static const char *pretty_printing_style = default_pretty_printing_style; + +static char callchain_default_opt[] = "fractal,0.5,callee"; +static bool inverted_callchain; +static symbol_filter_t annotate_init; + +static const char *cpu_list; +static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); -static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, - struct addr_location *al, - struct perf_sample *sample, - struct machine *machine) +static int perf_session__add_hist_entry(struct perf_session *session, + struct addr_location *al, + struct perf_sample *sample, + struct perf_evsel *evsel) { struct symbol *parent = NULL; int err = 0; struct hist_entry *he; if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = machine__resolve_callchain(machine, evsel, al->thread, - sample->callchain, &parent); + err = perf_session__resolve_callchain(session, al->thread, + sample->callchain, &parent); if (err) return err; } @@ -74,8 +76,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, return -ENOMEM; if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, - &evsel->hists.callchain_cursor, + err = callchain_append(he->callchain, &session->callchain_cursor, sample->period); if (err) return err; @@ -91,7 +92,8 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, assert(evsel != NULL); err = -ENOMEM; - if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0) + if (notes->src == NULL && + symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0) goto out; err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); @@ -104,32 +106,30 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, } -static int process_sample_event(struct perf_tool *tool, - union perf_event *event, +static int process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, 
-				struct machine *machine)
+				struct perf_session *session)
 {
-	struct perf_report *rep = container_of(tool, struct perf_report, tool);
 	struct addr_location al;
 
-	if (perf_event__preprocess_sample(event, machine, &al, sample,
-					  rep->annotate_init) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample,
+					  annotate_init) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
 			event->header.type);
 		return -1;
 	}
 
-	if (al.filtered || (rep->hide_unresolved && al.sym == NULL))
+	if (al.filtered || (hide_unresolved && al.sym == NULL))
 		return 0;
 
-	if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
+	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
 		return 0;
 
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
 
-	if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) {
+	if (perf_session__add_hist_entry(session, &al, sample, evsel)) {
 		pr_debug("problem incrementing symbol period, skipping event\n");
 		return -1;
 	}
@@ -137,17 +137,15 @@ static int process_sample_event(struct perf_tool *tool,
 	return 0;
 }
 
-static int process_read_event(struct perf_tool *tool,
-			      union perf_event *event,
+static int process_read_event(union perf_event *event,
 			      struct perf_sample *sample __used,
-			      struct perf_evsel *evsel,
-			      struct machine *machine __used)
+			      struct perf_session *session)
 {
-	struct perf_report *rep = container_of(tool, struct perf_report, tool);
-
-	if (rep->show_threads) {
+	struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist,
+							 event->read.id);
+	if (show_threads) {
 		const char *name = evsel ? event_name(evsel) : "unknown";
-		perf_read_values_add_value(&rep->show_threads_values,
+		perf_read_values_add_value(&show_threads_values,
 					   event->read.pid, event->read.tid,
 					   event->read.id,
 					   name,
@@ -161,10 +159,8 @@ static int process_read_event(struct perf_tool *tool,
 	return 0;
 }
 
-static int perf_report__setup_sample_type(struct perf_report *rep)
+static int perf_session__setup_sample_type(struct perf_session *self)
 {
-	struct perf_session *self = rep->session;
-
 	if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) {
 		if (sort__has_parent) {
 			ui__warning("Selected --sort parent, but no "
@@ -177,8 +173,7 @@ static int perf_report__setup_sample_type(struct perf_report *rep)
 				    "you call 'perf record' without -g?\n");
 			return -1;
 		}
-	} else if (!rep->dont_use_callchains &&
-		   callchain_param.mode != CHAIN_NONE &&
+	} else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE &&
 		   !symbol_conf.use_callchain) {
 			symbol_conf.use_callchain = true;
 			if (callchain_register_param(&callchain_param) < 0) {
@@ -191,6 +186,22 @@
 	return 0;
 }
 
+static struct perf_event_ops event_ops = {
+	.sample		 = process_sample_event,
+	.mmap		 = perf_event__process_mmap,
+	.comm		 = perf_event__process_comm,
+	.exit		 = perf_event__process_task,
+	.fork		 = perf_event__process_task,
+	.lost		 = perf_event__process_lost,
+	.read		 = process_read_event,
+	.attr		 = perf_event__process_attr,
+	.event_type	 = perf_event__process_event_type,
+	.tracing_data	 = perf_event__process_tracing_data,
+	.build_id	 = perf_event__process_build_id,
+	.ordered_samples = true,
+	.ordering_requires_timestamps = true,
+};
+
 extern volatile int session_done;
 
 static void sig_handler(int sig __used)
@@ -213,7 +224,6 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self,
 }
 
 static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
-					 struct perf_report *rep,
 					 const char *help)
 {
 	struct perf_evsel *pos;
@@ -231,18 +241,18 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 	    parent_pattern == default_parent_pattern) {
 		fprintf(stdout, "#\n# (%s)\n#\n", help);
 
-		if (rep->show_threads) {
-			bool style = !strcmp(rep->pretty_printing_style, "raw");
-			perf_read_values_display(stdout, &rep->show_threads_values,
+		if (show_threads) {
+			bool style = !strcmp(pretty_printing_style, "raw");
+			perf_read_values_display(stdout, &show_threads_values,
 						 style);
-			perf_read_values_destroy(&rep->show_threads_values);
+			perf_read_values_destroy(&show_threads_values);
 		}
 	}
 
 	return 0;
 }
 
-static int __cmd_report(struct perf_report *rep)
+static int __cmd_report(void)
 {
 	int ret = -EINVAL;
 	u64 nr_samples;
@@ -254,31 +264,27 @@ static int __cmd_report(struct perf_report *rep)
 
 	signal(SIGINT, sig_handler);
 
-	session = perf_session__new(rep->input_name, O_RDONLY,
-				    rep->force, false, &rep->tool);
+	session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
 	if (session == NULL)
 		return -ENOMEM;
 
-	rep->session = session;
-
-	if (rep->cpu_list) {
-		ret = perf_session__cpu_bitmap(session, rep->cpu_list,
-					       rep->cpu_bitmap);
+	if (cpu_list) {
+		ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
 		if (ret)
 			goto out_delete;
 	}
 
 	if (use_browser <= 0)
-		perf_session__fprintf_info(session, stdout, rep->show_full_info);
+		perf_session__fprintf_info(session, stdout, show_full_info);
 
-	if (rep->show_threads)
-		perf_read_values_init(&rep->show_threads_values);
+	if (show_threads)
+		perf_read_values_init(&show_threads_values);
 
-	ret = perf_report__setup_sample_type(rep);
+	ret = perf_session__setup_sample_type(session);
 	if (ret)
 		goto out_delete;
 
-	ret = perf_session__process_events(session, &rep->tool);
+	ret = perf_session__process_events(session, &event_ops);
 	if (ret)
 		goto out_delete;
 
@@ -321,7 +327,7 @@ static int __cmd_report(struct perf_report *rep)
 	}
 
 	if (nr_samples == 0) {
-		ui__warning("The %s file has no samples!\n", session->filename);
+		ui__warning("The %s file has no samples!\n", input_name);
 		goto out_delete;
 	}
 
@@ -329,7 +335,7 @@ static int __cmd_report(struct perf_report *rep)
 		perf_evlist__tui_browse_hists(session->evlist, help,
 					      NULL, NULL, 0);
 	} else
-		perf_evlist__tty_browse_hists(session->evlist, rep, help);
+		perf_evlist__tty_browse_hists(session->evlist, help);
 
 out_delete:
 	/*
@@ -348,9 +354,9 @@ static int __cmd_report(struct perf_report *rep)
 }
 
 static int
-parse_callchain_opt(const struct option *opt, const char *arg, int unset)
+parse_callchain_opt(const struct option *opt __used, const char *arg,
+		    int unset)
 {
-	struct perf_report *rep = (struct perf_report *)opt->value;
 	char *tok, *tok2;
 	char *endptr;
 
@@ -358,7 +364,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 	 * --no-call-graph
 	 */
 	if (unset) {
-		rep->dont_use_callchains = true;
+		dont_use_callchains = true;
 		return 0;
 	}
 
@@ -406,7 +412,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 		goto setup;
 
 	if (tok2[0] != 'c') {
-		callchain_param.print_limit = strtoul(tok2, &endptr, 0);
+		callchain_param.print_limit = strtod(tok2, &endptr);
 		tok2 = strtok(NULL, ",");
 		if (!tok2)
 			goto setup;
@@ -427,34 +433,13 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 	return 0;
 }
 
-int cmd_report(int argc, const char **argv, const char *prefix __used)
-{
-	struct stat st;
-	char callchain_default_opt[] = "fractal,0.5,callee";
-	const char * const report_usage[] = {
-		"perf report [<options>]",
-		NULL
-	};
-	struct perf_report report = {
-		.tool = {
-			.sample		 = process_sample_event,
-			.mmap		 = perf_event__process_mmap,
-			.comm		 = perf_event__process_comm,
-			.exit		 = perf_event__process_task,
-			.fork		 = perf_event__process_task,
-			.lost		 = perf_event__process_lost,
-			.read		 = process_read_event,
-			.attr		 = perf_event__process_attr,
-			.event_type	 = perf_event__process_event_type,
-			.tracing_data	 = perf_event__process_tracing_data,
-			.build_id	 = perf_event__process_build_id,
-			.ordered_samples = true,
-			.ordering_requires_timestamps = true,
-		},
-		.pretty_printing_style	 = "normal",
-	};
-	const struct option options[] = {
-	OPT_STRING('i', "input", &report.input_name, "file",
+static const char * const report_usage[] = {
+	"perf report [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
@@ -464,18 +449,17 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		   "file", "vmlinux pathname"),
 	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
 		   "file", "kallsyms pathname"),
-	OPT_BOOLEAN('f', "force", &report.force, "don't complain, do it"),
+	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
 	OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
 		    "load module symbols - WARNING: use only with -k and LIVE kernel"),
 	OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
 		    "Show a column with the number of samples"),
-	OPT_BOOLEAN('T', "threads", &report.show_threads,
+	OPT_BOOLEAN('T', "threads", &show_threads,
 		    "Show per-thread event counters"),
-	OPT_STRING(0, "pretty", &report.pretty_printing_style, "key",
+	OPT_STRING(0, "pretty", &pretty_printing_style, "key",
 		   "pretty printing style key: normal raw"),
-	OPT_BOOLEAN(0, "tui", &report.use_tui, "Use the TUI interface"),
-	OPT_BOOLEAN(0, "stdio", &report.use_stdio,
-		    "Use the stdio interface"),
+	OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+	OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): pid, comm, dso, symbol, parent"),
 	OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
@@ -484,14 +468,13 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		   "regex filter to identify parent, see: '--sort parent'"),
 	OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
 		    "Only display entries with parent-match"),
-	OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
-		     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit and callchain order. "
+	OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent, call_order",
+		     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold and callchain order. "
 		     "Default: fractal,0.5,callee", &parse_callchain_opt,
 		     callchain_default_opt),
-	OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
-		    "alias for inverted call graph"),
+	OPT_BOOLEAN('G', "inverted", &inverted_callchain, "alias for inverted call graph"),
 	OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
 		   "only consider symbols in these dsos"),
-	OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+	OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
 		   "only consider symbols in these comms"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
 		   "only consider these symbols"),
@@ -501,13 +484,12 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
 		   "separator for columns, no spaces will be added between "
 		   "columns '.' is reserved."),
-	OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved,
+	OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
 		    "Only display entries resolved to a symbol"),
 	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
 		    "Look for files with symbols relative to this directory"),
-	OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
-		   "list of cpus to profile"),
-	OPT_BOOLEAN('I', "show-info", &report.show_full_info,
+	OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+	OPT_BOOLEAN('I', "show-info", &show_full_info,
 		    "Display extended information about perf.data file"),
 	OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
 		    "Interleave source code with assembly code (default)"),
@@ -518,30 +500,24 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
 		    "Show a column with the sum of periods"),
 	OPT_END()
-	};
+};
 
+int cmd_report(int argc, const char **argv, const char *prefix __used)
+{
 	argc = parse_options(argc, argv, options, report_usage, 0);
 
-	if (report.use_stdio)
+	if (use_stdio)
 		use_browser = 0;
-	else if (report.use_tui)
+	else if (use_tui)
 		use_browser = 1;
 
-	if (report.inverted_callchain)
+	if (inverted_callchain)
 		callchain_param.order = ORDER_CALLER;
 
-	if (!report.input_name || !strlen(report.input_name)) {
-		if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
-			report.input_name = "-";
-		else
-			report.input_name = "perf.data";
-	}
-
-	if (strcmp(report.input_name, "-") != 0)
+	if (strcmp(input_name, "-") != 0)
 		setup_browser(true);
 	else
 		use_browser = 0;
-
 	/*
 	 * Only in the newt browser we are doing integrated annotation,
 	 * so don't allocate extra space that won't be used in the stdio
@@ -549,7 +525,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	 */
 	if (use_browser > 0) {
 		symbol_conf.priv_size = sizeof(struct annotation);
-		report.annotate_init  = symbol__annotate_init;
+		annotate_init = symbol__annotate_init;
 		/*
 		 * For searching by name on the "Browse map details".
 		 * providing it only in verbose mode not to bloat too
@@ -596,5 +572,5 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout);
 	sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
 
-	return __cmd_report(&report);
+	return __cmd_report();
 }
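[Note on the pattern restored above: instead of a per-command struct embedding a perf_tool, the old builtin-report.c wires its handlers into a file-scope callback table and hands that table to the session layer. A minimal, self-contained C sketch of this callback-table dispatch follows; the types and names are simplified stand-ins, not the real util/session.h definitions.]

#include <stdio.h>

struct session;                         /* stand-in for struct perf_session */
union event { int type; };              /* stand-in for union perf_event */

/* stand-in for struct perf_event_ops: one callback per event kind */
struct event_ops {
	int (*sample)(union event *ev, struct session *s);
	int (*comm)(union event *ev, struct session *s);
};

static int handle_sample(union event *ev, struct session *s)
{
	(void)s;
	printf("got sample, header type %d\n", ev->type);
	return 0;
}

static struct event_ops ops = {
	.sample = handle_sample,        /* .comm stays NULL and is skipped */
};

/* stand-in for the session event loop picking the right callback */
static int deliver(union event *ev, struct session *s)
{
	return ops.sample ? ops.sample(ev, s) : 0;
}

int main(void)
{
	union event ev = { .type = 9 }; /* 9 is PERF_RECORD_SAMPLE's value */
	return deliver(&ev, NULL);
}

[Leaving unset members NULL lets the session layer skip event types a builtin does not care about, which is why the real tables only name the callbacks they implement.]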
diff --git a/trunk/tools/perf/builtin-sched.c b/trunk/tools/perf/builtin-sched.c
index fb8b5f83b4a0..5177964943e7 100644
--- a/trunk/tools/perf/builtin-sched.c
+++ b/trunk/tools/perf/builtin-sched.c
@@ -2,14 +2,11 @@
 #include "perf.h"
 #include "util/util.h"
-#include "util/evlist.h"
 #include "util/cache.h"
-#include "util/evsel.h"
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/header.h"
 #include "util/session.h"
-#include "util/tool.h"
 
 #include "util/parse-options.h"
 #include "util/trace-event.h"
@@ -22,7 +19,7 @@
 #include
 #include
 
-static const char *input_name;
+static char const *input_name = "perf.data";
 
 static char default_sort_order[] = "avg, max, switch, runtime";
 static const char *sort_order = default_sort_order;
@@ -726,21 +723,21 @@ struct trace_migrate_task_event {
 
 struct trace_sched_handler {
 	void (*switch_event)(struct trace_switch_event *,
-			     struct machine *,
+			     struct perf_session *,
 			     struct event *,
 			     int cpu,
 			     u64 timestamp,
 			     struct thread *thread);
 
 	void (*runtime_event)(struct trace_runtime_event *,
-			      struct machine *,
+			      struct perf_session *,
 			      struct event *,
 			      int cpu,
 			      u64 timestamp,
 			      struct thread *thread);
 
 	void (*wakeup_event)(struct trace_wakeup_event *,
-			     struct machine *,
+			     struct perf_session *,
 			     struct event *,
 			     int cpu,
 			     u64 timestamp,
@@ -753,7 +750,7 @@ struct trace_sched_handler {
 			   struct thread *thread);
 
 	void (*migrate_task_event)(struct trace_migrate_task_event *,
-				   struct machine *machine,
+				   struct perf_session *session,
 				   struct event *,
 				   int cpu,
 				   u64 timestamp,
@@ -763,7 +760,7 @@ struct trace_sched_handler {
 
 static void
 replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
-		    struct machine *machine __used,
+		    struct perf_session *session __used,
 		    struct event *event,
 		    int cpu __used,
 		    u64 timestamp __used,
@@ -790,7 +787,7 @@ static u64 cpu_last_switched[MAX_CPUS];
 
 static void
 replay_switch_event(struct trace_switch_event *switch_event,
-		    struct machine *machine __used,
+		    struct perf_session *session __used,
 		    struct event *event,
 		    int cpu,
 		    u64 timestamp,
@@ -1024,7 +1021,7 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
 
 static void
 latency_switch_event(struct trace_switch_event *switch_event,
-		     struct machine *machine,
+		     struct perf_session *session,
 		     struct event *event __used,
 		     int cpu,
 		     u64 timestamp,
@@ -1048,8 +1045,8 @@ latency_switch_event(struct trace_switch_event *switch_event,
 			die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
 
-	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
-	sched_in = machine__findnew_thread(machine, switch_event->next_pid);
+	sched_out = perf_session__findnew(session, switch_event->prev_pid);
+	sched_in = perf_session__findnew(session, switch_event->next_pid);
 
 	out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
 	if (!out_events) {
@@ -1077,13 +1074,13 @@ latency_switch_event(struct trace_switch_event *switch_event,
 
 static void
 latency_runtime_event(struct trace_runtime_event *runtime_event,
-		      struct machine *machine,
+		      struct perf_session *session,
 		      struct event *event __used,
 		      int cpu,
 		      u64 timestamp,
 		      struct thread *this_thread __used)
 {
-	struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
+	struct thread *thread = perf_session__findnew(session, runtime_event->pid);
 	struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
 
 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
@@ -1100,7 +1097,7 @@ latency_runtime_event(struct trace_runtime_event *runtime_event,
 
 static void
 latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
-		     struct machine *machine,
+		     struct perf_session *session,
 		     struct event *__event __used,
 		     int cpu __used,
 		     u64 timestamp,
@@ -1114,7 +1111,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	if (!wakeup_event->success)
 		return;
 
-	wakee = machine__findnew_thread(machine, wakeup_event->pid);
+	wakee = perf_session__findnew(session, wakeup_event->pid);
 	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
 	if (!atoms) {
 		thread_atoms_insert(wakee);
@@ -1148,7 +1145,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 
 static void
 latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
-			   struct machine *machine,
+			   struct perf_session *session,
 			   struct event *__event __used,
 			   int cpu __used,
 			   u64 timestamp,
@@ -1164,7 +1161,7 @@ latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
 	if (profile_cpu == -1)
 		return;
 
-	migrant = machine__findnew_thread(machine, migrate_task_event->pid);
+	migrant = perf_session__findnew(session, migrate_task_event->pid);
 	atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
 	if (!atoms) {
 		thread_atoms_insert(migrant);
@@ -1359,13 +1356,12 @@ static void sort_lat(void)
 static struct trace_sched_handler *trace_handler;
 
 static void
-process_sched_wakeup_event(struct perf_tool *tool __used,
+process_sched_wakeup_event(void *data, struct perf_session *session,
 			   struct event *event,
-			   struct perf_sample *sample,
-			   struct machine *machine,
-			   struct thread *thread)
+			   int cpu __used,
+			   u64 timestamp __used,
+			   struct thread *thread __used)
 {
-	void *data = sample->raw_data;
 	struct trace_wakeup_event wakeup_event;
 
 	FILL_COMMON_FIELDS(wakeup_event, event, data);
@@ -1377,8 +1373,8 @@ process_sched_wakeup_event(struct perf_tool *tool __used,
 	FILL_FIELD(wakeup_event, cpu, event, data);
 
 	if (trace_handler->wakeup_event)
-		trace_handler->wakeup_event(&wakeup_event, machine, event,
-					    sample->cpu, sample->time, thread);
+		trace_handler->wakeup_event(&wakeup_event, session, event,
+					    cpu, timestamp, thread);
 }
 
 /*
@@ -1396,7 +1392,7 @@ static char next_shortname2 = '0';
 
 static void
 map_switch_event(struct trace_switch_event *switch_event,
-		 struct machine *machine,
+		 struct perf_session *session,
 		 struct event *event __used,
 		 int this_cpu,
 		 u64 timestamp,
@@ -1424,8 +1420,8 @@ map_switch_event(struct trace_switch_event *switch_event,
 		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
 
-	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
-	sched_in = machine__findnew_thread(machine, switch_event->next_pid);
+	sched_out = perf_session__findnew(session, switch_event->prev_pid);
+	sched_in = perf_session__findnew(session, switch_event->next_pid);
 
 	curr_thread[this_cpu] = sched_in;
 
@@ -1473,15 +1469,14 @@ map_switch_event(struct trace_switch_event *switch_event,
 	}
 }
 
+
 static void
-process_sched_switch_event(struct perf_tool *tool __used,
+process_sched_switch_event(void *data, struct perf_session *session,
 			   struct event *event,
-			   struct perf_sample *sample,
-			   struct machine *machine,
-			   struct thread *thread)
+			   int this_cpu,
+			   u64 timestamp __used,
+			   struct thread *thread __used)
 {
-	int this_cpu = sample->cpu;
-	void *data = sample->raw_data;
 	struct trace_switch_event switch_event;
 
 	FILL_COMMON_FIELDS(switch_event, event, data);
@@ -1503,20 +1498,19 @@ process_sched_switch_event(struct perf_tool *tool __used,
 			nr_context_switch_bugs++;
 	}
 	if (trace_handler->switch_event)
-		trace_handler->switch_event(&switch_event, machine, event,
-					    this_cpu, sample->time, thread);
+		trace_handler->switch_event(&switch_event, session, event,
+					    this_cpu, timestamp, thread);
 
 	curr_pid[this_cpu] = switch_event.next_pid;
 }
 
 static void
-process_sched_runtime_event(struct perf_tool *tool __used,
-			    struct event *event,
-			    struct perf_sample *sample,
-			    struct machine *machine,
-			    struct thread *thread)
+process_sched_runtime_event(void *data, struct perf_session *session,
+			    struct event *event,
+			    int cpu __used,
+			    u64 timestamp __used,
+			    struct thread *thread __used)
 {
-	void *data = sample->raw_data;
 	struct trace_runtime_event runtime_event;
 
 	FILL_ARRAY(runtime_event, comm, event, data);
@@ -1525,18 +1519,16 @@ process_sched_runtime_event(struct perf_tool *tool __used,
 	FILL_FIELD(runtime_event, vruntime, event, data);
 
 	if (trace_handler->runtime_event)
-		trace_handler->runtime_event(&runtime_event, machine, event,
-					     sample->cpu, sample->time, thread);
+		trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread);
 }
 
 static void
-process_sched_fork_event(struct perf_tool *tool __used,
+process_sched_fork_event(void *data,
 			 struct event *event,
-			 struct perf_sample *sample,
-			 struct machine *machine __used,
-			 struct thread *thread)
+			 int cpu __used,
+			 u64 timestamp __used,
+			 struct thread *thread __used)
 {
-	void *data = sample->raw_data;
 	struct trace_fork_event fork_event;
 
 	FILL_COMMON_FIELDS(fork_event, event, data);
@@ -1548,14 +1540,13 @@ process_sched_fork_event(struct perf_tool *tool __used,
 
 	if (trace_handler->fork_event)
 		trace_handler->fork_event(&fork_event, event,
-					  sample->cpu, sample->time, thread);
+					  cpu, timestamp, thread);
 }
 
 static void
-process_sched_exit_event(struct perf_tool *tool __used,
-			 struct event *event,
-			 struct perf_sample *sample __used,
-			 struct machine *machine __used,
+process_sched_exit_event(struct event *event,
+			 int cpu __used,
+			 u64 timestamp __used,
 			 struct thread *thread __used)
 {
 	if (verbose)
@@ -1563,13 +1554,12 @@ process_sched_exit_event(struct perf_tool *tool __used,
 }
 
 static void
-process_sched_migrate_task_event(struct perf_tool *tool __used,
-				 struct event *event,
-				 struct perf_sample *sample,
-				 struct machine *machine,
-				 struct thread *thread)
+process_sched_migrate_task_event(void *data, struct perf_session *session,
+				 struct event *event,
+				 int cpu __used,
+				 u64 timestamp __used,
+				 struct thread *thread __used)
 {
-	void *data = sample->raw_data;
 	struct trace_migrate_task_event migrate_task_event;
 
 	FILL_COMMON_FIELDS(migrate_task_event, event, data);
@@ -1580,47 +1570,67 @@ process_sched_migrate_task_event(struct perf_tool *tool __used,
 	FILL_FIELD(migrate_task_event, cpu, event, data);
 
 	if (trace_handler->migrate_task_event)
-		trace_handler->migrate_task_event(&migrate_task_event, machine,
-						  event, sample->cpu,
-						  sample->time, thread);
+		trace_handler->migrate_task_event(&migrate_task_event, session,
+						  event, cpu, timestamp, thread);
 }
 
-typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event *event,
-				   struct perf_sample *sample,
-				   struct machine *machine,
-				   struct thread *thread);
+static void process_raw_event(union perf_event *raw_event __used,
+			      struct perf_session *session, void *data, int cpu,
+			      u64 timestamp, struct thread *thread)
+{
+	struct event *event;
+	int type;
+
+
+	type = trace_parse_common_type(data);
+	event = trace_find_event(type);
+
+	if (!strcmp(event->name, "sched_switch"))
+		process_sched_switch_event(data, session, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_stat_runtime"))
+		process_sched_runtime_event(data, session, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup"))
+		process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup_new"))
+		process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_fork"))
+		process_sched_fork_event(data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_exit"))
+		process_sched_exit_event(event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_migrate_task"))
+		process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
+}
 
-static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
-						 union perf_event *event __used,
-						 struct perf_sample *sample,
-						 struct perf_evsel *evsel,
-						 struct machine *machine)
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
+				struct perf_evsel *evsel __used,
+				struct perf_session *session)
 {
-	struct thread *thread = machine__findnew_thread(machine, sample->pid);
+	struct thread *thread;
+
+	if (!(session->sample_type & PERF_SAMPLE_RAW))
+		return 0;
 
+	thread = perf_session__findnew(session, sample->pid);
 	if (thread == NULL) {
-		pr_debug("problem processing %s event, skipping it.\n",
-			 evsel->name);
+		pr_debug("problem processing %d event, skipping it.\n",
+			 event->header.type);
 		return -1;
 	}
 
-	evsel->hists.stats.total_period += sample->period;
-	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
-	if (evsel->handler.func != NULL) {
-		tracepoint_handler f = evsel->handler.func;
+	if (profile_cpu != -1 && profile_cpu != (int)sample->cpu)
+		return 0;
 
-		if (evsel->handler.data == NULL)
-			evsel->handler.data = trace_find_event(evsel->attr.config);
-
-		f(tool, evsel->handler.data, sample, machine, thread);
-	}
+	process_raw_event(event, session, sample->raw_data, sample->cpu,
			  sample->time, thread);
 
 	return 0;
 }
 
-static struct perf_tool perf_sched = {
-	.sample		 = perf_sched__process_tracepoint_sample,
+static struct perf_event_ops event_ops = {
+	.sample		 = process_sample_event,
 	.comm		 = perf_event__process_comm,
 	.lost		 = perf_event__process_lost,
 	.fork		 = perf_event__process_task,
@@ -1630,25 +1640,13 @@ static struct perf_tool perf_sched = {
 static void read_events(bool destroy, struct perf_session **psession)
 {
 	int err = -EINVAL;
-	const struct perf_evsel_str_handler handlers[] = {
-		{ "sched:sched_switch",	      process_sched_switch_event, },
-		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
-		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
-		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
-		{ "sched:sched_process_fork", process_sched_fork_event, },
-		{ "sched:sched_process_exit", process_sched_exit_event, },
-		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
-	};
 	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-							 0, false, &perf_sched);
+							 0, false, &event_ops);
 	if (session == NULL)
 		die("No Memory");
 
-	err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers);
-	assert(err == 0);
-
 	if (perf_session__has_traces(session, "record -R")) {
-		err = perf_session__process_events(session, &perf_sched);
+		err = perf_session__process_events(session, &event_ops);
 		if (err)
 			die("Failed to process events, error %d", err);
diff --git a/trunk/tools/perf/builtin-script.c b/trunk/tools/perf/builtin-script.c
index fd1909afcfd6..2f62a2952269 100644
--- a/trunk/tools/perf/builtin-script.c
+++ b/trunk/tools/perf/builtin-script.c
@@ -7,7 +7,6 @@
 #include "util/header.h"
 #include "util/parse-options.h"
 #include "util/session.h"
-#include "util/tool.h"
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/trace-event.h"
@@ -24,7 +23,6 @@ static u64 nr_unordered;
 extern const struct option record_options[];
 static bool no_callchain;
 static bool show_full_info;
-static bool system_wide;
 static const char *cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
@@ -317,7 +315,7 @@ static bool sample_addr_correlates_sym(struct perf_event_attr *attr)
 
 static void print_sample_addr(union perf_event *event,
 			      struct perf_sample *sample,
-			      struct machine *machine,
+			      struct perf_session *session,
 			      struct thread *thread,
 			      struct perf_event_attr *attr)
 {
@@ -330,11 +328,11 @@ static void print_sample_addr(union perf_event *event,
 	if (!sample_addr_correlates_sym(attr))
 		return;
 
-	thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
-			      sample->addr, &al);
+	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
+			      event->ip.pid, sample->addr, &al);
 	if (!al.map)
-		thread__find_addr_map(thread, machine, cpumode, MAP__VARIABLE,
-				      sample->addr, &al);
+		thread__find_addr_map(thread, session, cpumode, MAP__VARIABLE,
+				      event->ip.pid, sample->addr, &al);
 
 	al.cpu = sample->cpu;
 	al.sym = NULL;
@@ -364,7 +362,7 @@ static void print_sample_addr(union perf_event *event,
 static void process_event(union perf_event *event __unused,
 			  struct perf_sample *sample,
 			  struct perf_evsel *evsel,
-			  struct machine *machine,
+			  struct perf_session *session,
 			  struct thread *thread)
 {
 	struct perf_event_attr *attr = &evsel->attr;
@@ -379,15 +377,15 @@ static void process_event(union perf_event *event __unused,
 			  sample->raw_size);
 
 	if (PRINT_FIELD(ADDR))
-		print_sample_addr(event, sample, machine, thread, attr);
+		print_sample_addr(event, sample, session, thread, attr);
 
 	if (PRINT_FIELD(IP)) {
 		if (!symbol_conf.use_callchain)
 			printf(" ");
 		else
 			printf("\n");
-		perf_event__print_ip(event, sample, machine, evsel,
-				     PRINT_FIELD(SYM), PRINT_FIELD(DSO));
+		perf_session__print_ip(event, sample, session,
+				       PRINT_FIELD(SYM), PRINT_FIELD(DSO));
 	}
 
 	printf("\n");
@@ -434,16 +432,14 @@ static int cleanup_scripting(void)
 	return scripting_ops->stop_script();
 }
 
-static const char *input_name;
+static char const *input_name = "perf.data";
 
-static int process_sample_event(struct perf_tool *tool __used,
-				union perf_event *event,
+static int process_sample_event(union perf_event *event,
 				struct perf_sample *sample,
 				struct perf_evsel *evsel,
-				struct machine *machine)
+				struct perf_session *session)
 {
-	struct addr_location al;
-	struct thread *thread = machine__findnew_thread(machine, event->ip.tid);
+	struct thread *thread = perf_session__findnew(session, event->ip.pid);
 
 	if (thread == NULL) {
 		pr_debug("problem processing %d event, skipping it.\n",
@@ -462,25 +458,16 @@ static int process_sample_event(struct perf_tool *tool __used,
 		return 0;
 	}
 
-	if (perf_event__preprocess_sample(event, machine, &al, sample, 0) < 0) {
-		pr_err("problem processing %d event, skipping it.\n",
-		       event->header.type);
-		return -1;
-	}
-
-	if (al.filtered)
-		return 0;
-
 	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
 		return 0;
 
-	scripting_ops->process_event(event, sample, evsel, machine, thread);
+	scripting_ops->process_event(event, sample, evsel, session, thread);
 
-	evsel->hists.stats.total_period += sample->period;
+	session->hists.stats.total_period += sample->period;
 	return 0;
 }
 
-static struct perf_tool perf_script = {
+static struct perf_event_ops event_ops = {
 	.sample		 = process_sample_event,
 	.mmap		 = perf_event__process_mmap,
 	.comm		 = perf_event__process_comm,
@@ -507,7 +494,7 @@ static int __cmd_script(struct perf_session *session)
 
 	signal(SIGINT, sig_handler);
 
-	ret = perf_session__process_events(session, &perf_script);
+	ret = perf_session__process_events(session, &event_ops);
 
 	if (debug_mode)
 		pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered);
@@ -536,6 +523,12 @@ static struct script_spec *script_spec__new(const char *spec,
 	return s;
 }
 
+static void script_spec__delete(struct script_spec *s)
+{
+	free(s->spec);
+	free(s);
+}
+
 static void script_spec__add(struct script_spec *s)
 {
 	list_add_tail(&s->node, &script_specs);
@@ -561,11 +554,16 @@ static struct script_spec *script_spec__findnew(const char *spec,
 
 	s = script_spec__new(spec, ops);
 	if (!s)
-		return NULL;
+		goto out_delete_spec;
 
 	script_spec__add(s);
 
 	return s;
+
+out_delete_spec:
+	script_spec__delete(s);
+
+	return NULL;
 }
 
 int script_spec_register(const char *spec, struct scripting_ops *ops)
@@ -683,8 +681,7 @@ static int parse_output_fields(const struct option *opt __used,
 			type = PERF_TYPE_RAW;
 		else {
 			fprintf(stderr, "Invalid event type in field string.\n");
-			rc = -EINVAL;
-			goto out;
+			return -EINVAL;
 		}
 
 		if (output[type].user_set)
@@ -926,24 +923,6 @@ static int read_script_info(struct script_desc *desc, const char *filename)
 	return 0;
 }
 
-static char *get_script_root(struct dirent *script_dirent, const char *suffix)
-{
-	char *script_root, *str;
-
-	script_root = strdup(script_dirent->d_name);
-	if (!script_root)
-		return NULL;
-
-	str = (char *)ends_with(script_root, suffix);
-	if (!str) {
-		free(script_root);
-		return NULL;
-	}
-
-	*str = '\0';
-	return script_root;
-}
-
 static int list_available_scripts(const struct option *opt __used,
 				  const char *s __used, int unset __used)
 {
@@ -955,6 +934,7 @@ static int list_available_scripts(const struct option *opt __used,
 	struct script_desc *desc;
 	char first_half[BUFSIZ];
 	char *script_root;
+	char *str;
 
 	snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
 
@@ -970,14 +950,16 @@ static int list_available_scripts(const struct option *opt __used,
 			continue;
 
 		for_each_script(lang_path, lang_dir, script_dirent, script_next) {
-			script_root = get_script_root(&script_dirent, REPORT_SUFFIX);
-			if (script_root) {
+			script_root = strdup(script_dirent.d_name);
+			str = (char *)ends_with(script_root, REPORT_SUFFIX);
+			if (str) {
+				*str = '\0';
 				desc = script_desc__findnew(script_root);
 				snprintf(script_path, MAXPATHLEN, "%s/%s",
 					 lang_path, script_dirent.d_name);
 				read_script_info(desc, script_path);
-				free(script_root);
 			}
+			free(script_root);
 		}
 	}
 
@@ -999,7 +981,8 @@ static char *get_script_path(const char *script_root, const char *suffix)
 	char script_path[MAXPATHLEN];
 	DIR *scripts_dir, *lang_dir;
 	char lang_path[MAXPATHLEN];
-	char *__script_root;
+	char *str, *__script_root;
+	char *path = NULL;
 
 	snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
 
@@ -1015,18 +998,23 @@ static char *get_script_path(const char *script_root, const char *suffix)
 			continue;
 
 		for_each_script(lang_path, lang_dir, script_dirent, script_next) {
-			__script_root = get_script_root(&script_dirent, suffix);
-			if (__script_root && !strcmp(script_root, __script_root)) {
-				free(__script_root);
+			__script_root = strdup(script_dirent.d_name);
+			str = (char *)ends_with(__script_root, suffix);
+			if (str) {
+				*str = '\0';
+				if (strcmp(__script_root, script_root))
+					continue;
 				snprintf(script_path, MAXPATHLEN, "%s/%s",
 					 lang_path, script_dirent.d_name);
-				return strdup(script_path);
+				path = strdup(script_path);
+				free(__script_root);
+				break;
 			}
 			free(__script_root);
 		}
 	}
 
-	return NULL;
+	return path;
 }
 
 static bool is_top_script(const char *script_path)
@@ -1095,11 +1083,7 @@ static const struct option options[] = {
 	OPT_CALLBACK('f', "fields", NULL, "str",
 		     "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr",
		     parse_output_fields),
-	OPT_BOOLEAN('a', "all-cpus", &system_wide,
-		    "system-wide collection from all CPUs"),
-	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
-	OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
-		   "only display events for these comms"),
+	OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
 	OPT_BOOLEAN('I', "show-info", &show_full_info,
 		    "display extended information from perf.data file"),
 	OPT_END()
@@ -1126,6 +1110,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
 	struct perf_session *session;
 	char *script_path = NULL;
 	const char **__argv;
+	bool system_wide;
 	int i, j, err;
 
 	setup_scripting();
@@ -1193,17 +1178,15 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
 		}
 
 		if (!pid) {
+			system_wide = true;
 			j = 0;
 
 			dup2(live_pipe[1], 1);
 			close(live_pipe[0]);
 
-			if (is_top_script(argv[0])) {
-				system_wide = true;
-			} else if (!system_wide) {
+			if (!is_top_script(argv[0]))
 				system_wide = !have_cmd(argc - rep_args,
 							&argv[rep_args]);
-			}
 
 			__argv = malloc((argc + 6) * sizeof(const char *));
 			if (!__argv)
@@ -1251,11 +1234,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
 		script_path = rep_script_path;
 
 	if (script_path) {
+		system_wide = false;
 		j = 0;
 
-		if (!rec_script_path)
-			system_wide = false;
-		else if (!system_wide)
+		if (rec_script_path)
 			system_wide = !have_cmd(argc - 1, &argv[1]);
 
 		__argv = malloc((argc + 2) * sizeof(const char *));
@@ -1279,7 +1261,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
 	if (!script_name)
 		setup_pager();
 
-	session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_script);
+	session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops);
 	if (session == NULL)
 		return -ENOMEM;
 
@@ -1305,7 +1287,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
 			return -1;
 		}
 
-		input = open(session->filename, O_RDONLY);	/* input_name */
+		input = open(input_name, O_RDONLY);
 		if (input < 0) {
 			perror("failed to open file");
 			exit(-1);
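[The script-listing hunks above replace the get_script_root() helper with inline strdup() plus ends_with() suffix stripping. A stand-alone sketch of that idiom, with a local ends_with(); perf's own helper lives in util, this one assumes only standard C.]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* return a pointer to where suffix begins inside str, or NULL */
static char *ends_with(char *str, const char *suffix)
{
	size_t len = strlen(str), slen = strlen(suffix);

	if (len < slen || strcmp(str + len - slen, suffix))
		return NULL;
	return str + len - slen;
}

int main(void)
{
	char *root = strdup("syscall-counts-report.pl"); /* hypothetical entry */
	char *str = root ? ends_with(root, "-report.pl") : NULL;

	if (str) {
		*str = '\0';                    /* "syscall-counts" remains */
		printf("script root: %s\n", root);
	}
	free(root);
	return 0;
}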
diff --git a/trunk/tools/perf/builtin-stat.c b/trunk/tools/perf/builtin-stat.c
index f5d2a63eba66..955930e0a5c3 100644
--- a/trunk/tools/perf/builtin-stat.c
+++ b/trunk/tools/perf/builtin-stat.c
@@ -578,33 +578,6 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
 			avg / avg_stats(&walltime_nsecs_stats));
 }
 
-/* used for get_ratio_color() */
-enum grc_type {
-	GRC_STALLED_CYCLES_FE,
-	GRC_STALLED_CYCLES_BE,
-	GRC_CACHE_MISSES,
-	GRC_MAX_NR
-};
-
-static const char *get_ratio_color(enum grc_type type, double ratio)
-{
-	static const double grc_table[GRC_MAX_NR][3] = {
-		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
-		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
-		[GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 },
-	};
-	const char *color = PERF_COLOR_NORMAL;
-
-	if (ratio > grc_table[type][0])
-		color = PERF_COLOR_RED;
-	else if (ratio > grc_table[type][1])
-		color = PERF_COLOR_MAGENTA;
-	else if (ratio > grc_table[type][2])
-		color = PERF_COLOR_YELLOW;
-
-	return color;
-}
-
 static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
 {
 	double total, ratio = 0.0;
@@ -615,7 +588,13 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 50.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 30.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -632,7 +611,13 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 75.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 50.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 20.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -649,7 +634,13 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 20.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 5.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -666,7 +657,13 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 20.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 5.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -683,7 +680,13 @@ static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, dou
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 20.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 5.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -700,7 +703,13 @@ static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 20.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 5.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -717,7 +726,13 @@ static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 20.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 5.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -734,7 +749,13 @@ static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, doub
 	if (total)
 		ratio = avg / total * 100.0;
 
-	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+	color = PERF_COLOR_NORMAL;
+	if (ratio > 20.0)
+		color = PERF_COLOR_RED;
+	else if (ratio > 10.0)
+		color = PERF_COLOR_MAGENTA;
+	else if (ratio > 5.0)
+		color = PERF_COLOR_YELLOW;
 
 	fprintf(output, " # ");
 	color_fprintf(output, color, "%6.2f%%", ratio);
@@ -1087,13 +1108,22 @@ static const struct option options[] = {
  */
 static int add_default_attributes(void)
 {
+	struct perf_evsel *pos;
+	size_t attr_nr = 0;
+	size_t c;
+
 	/* Set attrs if no event is selected and !null_run: */
 	if (null_run)
 		return 0;
 
 	if (!evsel_list->nr_entries) {
-		if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0)
-			return -1;
+		for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
+			pos = perf_evsel__new(default_attrs + c, c + attr_nr);
+			if (pos == NULL)
+				return -1;
+			perf_evlist__add(evsel_list, pos);
+		}
+		attr_nr += c;
 	}
 
 	/* Detailed events get appended to the event list: */
@@ -1102,21 +1132,38 @@ static int add_default_attributes(void)
 		return 0;
 
 	/* Append detailed run extra attributes: */
-	if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0)
-		return -1;
+	for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
+		pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
+		if (pos == NULL)
+			return -1;
+		perf_evlist__add(evsel_list, pos);
+	}
+	attr_nr += c;
 
 	if (detailed_run < 2)
 		return 0;
 
 	/* Append very detailed run extra attributes: */
-	if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0)
-		return -1;
+	for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
+		pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
+		if (pos == NULL)
+			return -1;
+		perf_evlist__add(evsel_list, pos);
+	}
 
 	if (detailed_run < 3)
 		return 0;
 
 	/* Append very, very detailed run extra attributes: */
-	return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs);
+	for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
+		pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
+		if (pos == NULL)
+			return -1;
+		perf_evlist__add(evsel_list, pos);
+	}
+
+
+	return 0;
 }
 
 int cmd_stat(int argc, const char **argv, const char *prefix __used)
@@ -1220,7 +1267,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
-		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0)
+		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
+		    perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
 			goto out_free_fd;
 	}
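[All eight print_*() hunks above open-code the same three-threshold color choice that the removed get_ratio_color() centralized. The thresholds in the sketch below are copied from those hunks; the rest is an illustrative condensation, not perf code.]

#include <stdio.h>

/* map a percentage against descending thresholds to a severity color */
static const char *ratio_color(const double t[3], double ratio)
{
	if (ratio > t[0])
		return "red";
	if (ratio > t[1])
		return "magenta";
	if (ratio > t[2])
		return "yellow";
	return "normal";
}

int main(void)
{
	static const double cache_misses[3] = { 20.0, 10.0, 5.0 };
	static const double stalled_fe[3]   = { 50.0, 30.0, 10.0 };

	printf("12%% misses -> %s\n", ratio_color(cache_misses, 12.0));
	printf("55%% stalls -> %s\n", ratio_color(stalled_fe, 55.0));
	return 0;
}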
diff --git a/trunk/tools/perf/builtin-test.c b/trunk/tools/perf/builtin-test.c
index 2b9a7f497a20..831d1baeac37 100644
--- a/trunk/tools/perf/builtin-test.c
+++ b/trunk/tools/perf/builtin-test.c
@@ -7,7 +7,6 @@
 
 #include "util/cache.h"
 #include "util/debug.h"
-#include "util/debugfs.h"
 #include "util/evlist.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -15,6 +14,8 @@
 #include "util/thread_map.h"
 #include "../../include/linux/hw_breakpoint.h"
 
+static long page_size;
+
 static int vmlinux_matches_kallsyms_filter(struct map *map __used, struct symbol *sym)
 {
 	bool *visited = symbol__priv(sym);
@@ -30,7 +31,6 @@ static int test__vmlinux_matches_kallsyms(void)
 	struct map *kallsyms_map, *vmlinux_map;
 	struct machine kallsyms, vmlinux;
 	enum map_type type = MAP__FUNCTION;
-	long page_size = sysconf(_SC_PAGE_SIZE);
 	struct ref_reloc_sym ref_reloc_sym = { .name = "_stext", };
 
 	/*
@@ -247,7 +247,7 @@ static int trace_event__id(const char *evname)
 
 	if (asprintf(&filename,
 		     "%s/syscalls/%s/id",
-		     tracing_events_path, evname) < 0)
+		     debugfs_path, evname) < 0)
 		return -1;
 
 	fd = open(filename, O_RDONLY);
@@ -603,7 +603,7 @@ static int test__basic_mmap(void)
 
 #define TEST_ASSERT_VAL(text, cond) \
 do { \
-	if (!(cond)) { \
+	if (!cond) { \
 		pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \
 		return -1; \
 	} \
@@ -759,103 +759,6 @@ static int test__checkevent_breakpoint_w(struct perf_evlist *evlist)
 	return 0;
 }
 
-static int test__checkevent_tracepoint_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel = list_entry(evlist->entries.next,
-					      struct perf_evsel, node);
-
-	TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
-	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
-	TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
-	TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
-
-	return test__checkevent_tracepoint(evlist);
-}
-
-static int
-test__checkevent_tracepoint_multi_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel;
-
-	TEST_ASSERT_VAL("wrong number of entries", evlist->nr_entries > 1);
-
-	list_for_each_entry(evsel, &evlist->entries, node) {
-		TEST_ASSERT_VAL("wrong exclude_user",
-				!evsel->attr.exclude_user);
-		TEST_ASSERT_VAL("wrong exclude_kernel",
-				evsel->attr.exclude_kernel);
-		TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
-		TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
-	}
-
-	return test__checkevent_tracepoint_multi(evlist);
-}
-
-static int test__checkevent_raw_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel = list_entry(evlist->entries.next,
-					      struct perf_evsel, node);
-
-	TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
-	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
-	TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
-	TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
-
-	return test__checkevent_raw(evlist);
-}
-
-static int test__checkevent_numeric_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel = list_entry(evlist->entries.next,
-					      struct perf_evsel, node);
-
-	TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
-	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
-	TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
-	TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
-
-	return test__checkevent_numeric(evlist);
-}
-
-static int test__checkevent_symbolic_name_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel = list_entry(evlist->entries.next,
-					      struct perf_evsel, node);
-
-	TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
-	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
-	TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
-	TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
-
-	return test__checkevent_symbolic_name(evlist);
-}
-
-static int test__checkevent_symbolic_alias_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel = list_entry(evlist->entries.next,
-					      struct perf_evsel, node);
-
-	TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
-	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
-	TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
-	TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
-
-	return test__checkevent_symbolic_alias(evlist);
-}
-
-static int test__checkevent_genhw_modifier(struct perf_evlist *evlist)
-{
-	struct perf_evsel *evsel = list_entry(evlist->entries.next,
-					      struct perf_evsel, node);
-
-	TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
-	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
-	TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
-	TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
-
-	return test__checkevent_genhw(evlist);
-}
-
 static struct test__event_st {
 	const char *name;
 	__u32 type;
@@ -905,34 +808,6 @@ static struct test__event_st {
 		.name  = "mem:0:w",
 		.check = test__checkevent_breakpoint_w,
 	},
-	{
-		.name  = "syscalls:sys_enter_open:k",
-		.check = test__checkevent_tracepoint_modifier,
-	},
-	{
-		.name  = "syscalls:*:u",
-		.check = test__checkevent_tracepoint_multi_modifier,
-	},
-	{
-		.name  = "r1:kp",
-		.check = test__checkevent_raw_modifier,
-	},
-	{
-		.name  = "1:1:hp",
-		.check = test__checkevent_numeric_modifier,
-	},
-	{
-		.name  = "instructions:h",
-		.check = test__checkevent_symbolic_name_modifier,
-	},
-	{
-		.name  = "faults:u",
-		.check = test__checkevent_symbolic_alias_modifier,
-	},
-	{
-		.name  = "L1-dcache-load-miss:kp",
-		.check = test__checkevent_genhw_modifier,
-	},
 };
 
 #define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st))
@@ -966,336 +841,6 @@ static int test__parse_events(void)
 
 	return ret;
 }
-
-static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t **maskp,
-					 size_t *sizep)
-{
-	cpu_set_t *mask;
-	size_t size;
-	int i, cpu = -1, nrcpus = 1024;
-realloc:
-	mask = CPU_ALLOC(nrcpus);
-	size = CPU_ALLOC_SIZE(nrcpus);
-	CPU_ZERO_S(size, mask);
-
-	if (sched_getaffinity(pid, size, mask) == -1) {
-		CPU_FREE(mask);
-		if (errno == EINVAL && nrcpus < (1024 << 8)) {
-			nrcpus = nrcpus << 2;
-			goto realloc;
-		}
-		perror("sched_getaffinity");
-		return -1;
-	}
-
-	for (i = 0; i < nrcpus; i++) {
-		if (CPU_ISSET_S(i, size, mask)) {
-			if (cpu == -1) {
-				cpu = i;
-				*maskp = mask;
-				*sizep = size;
-			} else
-				CPU_CLR_S(i, size, mask);
-		}
-	}
-
-	if (cpu == -1)
-		CPU_FREE(mask);
-
-	return cpu;
-}
-
-static int test__PERF_RECORD(void)
-{
-	struct perf_record_opts opts = {
-		.target_pid = -1,
-		.target_tid = -1,
-		.no_delay   = true,
-		.freq	    = 10,
-		.mmap_pages = 256,
-		.sample_id_all_avail = true,
-	};
-	cpu_set_t *cpu_mask = NULL;
-	size_t cpu_mask_size = 0;
-	struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
-	struct perf_evsel *evsel;
-	struct perf_sample sample;
-	const char *cmd = "sleep";
-	const char *argv[] = { cmd, "1", NULL, };
-	char *bname;
-	u64 sample_type, prev_time = 0;
-	bool found_cmd_mmap = false,
-	     found_libc_mmap = false,
-	     found_vdso_mmap = false,
-	     found_ld_mmap = false;
-	int err = -1, errs = 0, i, wakeups = 0, sample_size;
-	u32 cpu;
-	int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, };
-
-	if (evlist == NULL || argv == NULL) {
-		pr_debug("Not enough memory to create evlist\n");
-		goto out;
-	}
-
-	/*
-	 * We need at least one evsel in the evlist, use the default
-	 * one: "cycles".
-	 */
-	err = perf_evlist__add_default(evlist);
-	if (err < 0) {
-		pr_debug("Not enough memory to create evsel\n");
-		goto out_delete_evlist;
-	}
-
-	/*
-	 * Create maps of threads and cpus to monitor. In this case
-	 * we start with all threads and cpus (-1, -1) but then in
-	 * perf_evlist__prepare_workload we'll fill in the only thread
-	 * we're monitoring, the one forked there.
-	 */
-	err = perf_evlist__create_maps(evlist, opts.target_pid,
-				       opts.target_tid, opts.cpu_list);
-	if (err < 0) {
-		pr_debug("Not enough memory to create thread/cpu maps\n");
-		goto out_delete_evlist;
-	}
-
-	/*
-	 * Prepare the workload in argv[] to run, it'll fork it, and then wait
-	 * for perf_evlist__start_workload() to exec it. This is done this way
-	 * so that we have time to open the evlist (calling sys_perf_event_open
-	 * on all the fds) and then mmap them.
-	 */
-	err = perf_evlist__prepare_workload(evlist, &opts, argv);
-	if (err < 0) {
-		pr_debug("Couldn't run the workload!\n");
-		goto out_delete_evlist;
-	}
-
-	/*
-	 * Config the evsels, setting attr->comm on the first one, etc.
-	 */
-	evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
-	evsel->attr.sample_type |= PERF_SAMPLE_CPU;
-	evsel->attr.sample_type |= PERF_SAMPLE_TID;
-	evsel->attr.sample_type |= PERF_SAMPLE_TIME;
-	perf_evlist__config_attrs(evlist, &opts);
-
-	err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask,
-					    &cpu_mask_size);
-	if (err < 0) {
-		pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno));
-		goto out_delete_evlist;
-	}
-
-	cpu = err;
-
-	/*
-	 * So that we can check perf_sample.cpu on all the samples.
-	 */
-	if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, cpu_mask) < 0) {
-		pr_debug("sched_setaffinity: %s\n", strerror(errno));
-		goto out_free_cpu_mask;
-	}
-
-	/*
-	 * Call sys_perf_event_open on all the fds on all the evsels,
-	 * grouping them if asked to.
-	 */
-	err = perf_evlist__open(evlist, opts.group);
-	if (err < 0) {
-		pr_debug("perf_evlist__open: %s\n", strerror(errno));
-		goto out_delete_evlist;
-	}
-
-	/*
-	 * mmap the first fd on a given CPU and ask for events for the other
-	 * fds in the same CPU to be injected in the same mmap ring buffer
-	 * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)).
-	 */
-	err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
-	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
-		goto out_delete_evlist;
-	}
-
-	/*
-	 * We'll need these two to parse the PERF_SAMPLE_* fields in each
-	 * event.
-	 */
-	sample_type = perf_evlist__sample_type(evlist);
-	sample_size = __perf_evsel__sample_size(sample_type);
-
-	/*
-	 * Now that all is properly set up, enable the events, they will
-	 * count just on workload.pid, which will start...
-	 */
-	perf_evlist__enable(evlist);
-
-	/*
-	 * Now!
-	 */
-	perf_evlist__start_workload(evlist);
-
-	while (1) {
-		int before = total_events;
-
-		for (i = 0; i < evlist->nr_mmaps; i++) {
-			union perf_event *event;
-
-			while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
-				const u32 type = event->header.type;
-				const char *name = perf_event__name(type);
-
-				++total_events;
-				if (type < PERF_RECORD_MAX)
-					nr_events[type]++;
-
-				err = perf_event__parse_sample(event, sample_type,
-							       sample_size, true,
-							       &sample, false);
-				if (err < 0) {
-					if (verbose)
-						perf_event__fprintf(event, stderr);
-					pr_debug("Couldn't parse sample\n");
-					goto out_err;
-				}
-
-				if (verbose) {
-					pr_info("%" PRIu64" %d ", sample.time, sample.cpu);
-					perf_event__fprintf(event, stderr);
-				}
-
-				if (prev_time > sample.time) {
-					pr_debug("%s going backwards in time, prev=%" PRIu64 ", curr=%" PRIu64 "\n",
-						 name, prev_time, sample.time);
-					++errs;
-				}
-
-				prev_time = sample.time;
-
-				if (sample.cpu != cpu) {
-					pr_debug("%s with unexpected cpu, expected %d, got %d\n",
-						 name, cpu, sample.cpu);
-					++errs;
-				}
-
-				if ((pid_t)sample.pid != evlist->workload.pid) {
-					pr_debug("%s with unexpected pid, expected %d, got %d\n",
-						 name, evlist->workload.pid, sample.pid);
-					++errs;
-				}
-
-				if ((pid_t)sample.tid != evlist->workload.pid) {
-					pr_debug("%s with unexpected tid, expected %d, got %d\n",
-						 name, evlist->workload.pid, sample.tid);
-					++errs;
-				}
-
-				if ((type == PERF_RECORD_COMM ||
-				     type == PERF_RECORD_MMAP ||
-				     type == PERF_RECORD_FORK ||
-				     type == PERF_RECORD_EXIT) &&
-				    (pid_t)event->comm.pid != evlist->workload.pid) {
-					pr_debug("%s with unexpected pid/tid\n", name);
-					++errs;
-				}
-
-				if ((type == PERF_RECORD_COMM ||
-				     type == PERF_RECORD_MMAP) &&
-				     event->comm.pid != event->comm.tid) {
-					pr_debug("%s with different pid/tid!\n", name);
-					++errs;
-				}
-
-				switch (type) {
-				case PERF_RECORD_COMM:
-					if (strcmp(event->comm.comm, cmd)) {
-						pr_debug("%s with unexpected comm!\n", name);
-						++errs;
-					}
-					break;
-				case PERF_RECORD_EXIT:
-					goto found_exit;
-				case PERF_RECORD_MMAP:
-					bname = strrchr(event->mmap.filename, '/');
-					if (bname != NULL) {
-						if (!found_cmd_mmap)
-							found_cmd_mmap = !strcmp(bname + 1, cmd);
-						if (!found_libc_mmap)
-							found_libc_mmap = !strncmp(bname + 1, "libc", 4);
-						if (!found_ld_mmap)
-							found_ld_mmap = !strncmp(bname + 1, "ld", 2);
-					} else if (!found_vdso_mmap)
-						found_vdso_mmap = !strcmp(event->mmap.filename, "[vdso]");
-					break;
-
-				case PERF_RECORD_SAMPLE:
-					/* Just ignore samples for now */
-					break;
-				default:
-					pr_debug("Unexpected perf_event->header.type %d!\n",
-						 type);
-					++errs;
-				}
-			}
-		}
-
-		/*
-		 * We don't use poll here because at least at 3.1 times the
-		 * PERF_RECORD_{!SAMPLE} events don't honour
-		 * perf_event_attr.wakeup_events, just PERF_EVENT_SAMPLE does.
-		 */
-		if (total_events == before && false)
-			poll(evlist->pollfd, evlist->nr_fds, -1);
-
-		sleep(1);
-		if (++wakeups > 5) {
-			pr_debug("No PERF_RECORD_EXIT event!\n");
-			break;
-		}
-	}
-
-found_exit:
-	if (nr_events[PERF_RECORD_COMM] > 1) {
-		pr_debug("Excessive number of PERF_RECORD_COMM events!\n");
-		++errs;
-	}
-
-	if (nr_events[PERF_RECORD_COMM] == 0) {
-		pr_debug("Missing PERF_RECORD_COMM for %s!\n", cmd);
-		++errs;
-	}
-
-	if (!found_cmd_mmap) {
-		pr_debug("PERF_RECORD_MMAP for %s missing!\n", cmd);
-		++errs;
-	}
-
-	if (!found_libc_mmap) {
-		pr_debug("PERF_RECORD_MMAP for %s missing!\n", "libc");
-		++errs;
-	}
-
-	if (!found_ld_mmap) {
-		pr_debug("PERF_RECORD_MMAP for %s missing!\n", "ld");
-		++errs;
-	}
-
-	if (!found_vdso_mmap) {
-		pr_debug("PERF_RECORD_MMAP for %s missing!\n", "[vdso]");
-		++errs;
-	}
-out_err:
-	perf_evlist__munmap(evlist);
-out_free_cpu_mask:
-	CPU_FREE(cpu_mask);
-out_delete_evlist:
-	perf_evlist__delete(evlist);
-out:
-	return (err < 0 || errs > 0) ? -1 : 0;
-}
-
 static struct test {
 	const char *desc;
 	int (*func)(void);
@@ -1320,90 +865,46 @@ static struct test {
 		.desc = "parse events tests",
 		.func = test__parse_events,
 	},
-	{
-		.desc = "Validate PERF_RECORD_* events & perf_sample fields",
-		.func = test__PERF_RECORD,
-	},
 	{
 		.func = NULL,
 	},
 };
 
-static bool perf_test__matches(int curr, int argc, const char *argv[])
-{
-	int i;
-
-	if (argc == 0)
-		return true;
-
-	for (i = 0; i < argc; ++i) {
-		char *end;
-		long nr = strtoul(argv[i], &end, 10);
-
-		if (*end == '\0') {
-			if (nr == curr + 1)
-				return true;
-			continue;
-		}
-
-		if (strstr(tests[curr].desc, argv[i]))
-			return true;
-	}
-
-	return false;
-}
-
-static int __cmd_test(int argc, const char *argv[])
+static int __cmd_test(void)
 {
 	int i = 0;
 
-	while (tests[i].func) {
-		int curr = i++, err;
-
-		if (!perf_test__matches(curr, argc, argv))
-			continue;
+	page_size = sysconf(_SC_PAGE_SIZE);
 
-		pr_info("%2d: %s:", i, tests[curr].desc);
+	while (tests[i].func) {
+		int err;
+		pr_info("%2d: %s:", i + 1, tests[i].desc);
 		pr_debug("\n--- start ---\n");
-		err = tests[curr].func();
-		pr_debug("---- end ----\n%s:", tests[curr].desc);
+		err = tests[i].func();
+		pr_debug("---- end ----\n%s:", tests[i].desc);
 		pr_info(" %s\n", err ? "FAILED!\n" : "Ok");
+		++i;
 	}
 
 	return 0;
 }
 
-static int perf_test__list(int argc, const char **argv)
-{
-	int i = 0;
-
-	while (tests[i].func) {
-		int curr = i++;
-
-		if (argc > 1 && !strstr(tests[curr].desc, argv[1]))
-			continue;
-
-		pr_info("%2d: %s\n", i, tests[curr].desc);
-	}
-
-	return 0;
-}
-
-int cmd_test(int argc, const char **argv, const char *prefix __used)
-{
-	const char * const test_usage[] = {
-	"perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
+static const char * const test_usage[] = {
+	"perf test [<options>]",
 	NULL,
-	};
-	const struct option test_options[] = {
+};
+
+static const struct option test_options[] = {
 	OPT_INTEGER('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
-	};
+};
 
+int cmd_test(int argc, const char **argv, const char *prefix __used)
+{
 	argc = parse_options(argc, argv, test_options, test_usage, 0);
-	if (argc >= 1 && !strcmp(argv[0], "list"))
-		return perf_test__list(argc, argv);
+	if (argc)
+		usage_with_options(test_usage, test_options);
 
 	symbol_conf.priv_size = sizeof(int);
 	symbol_conf.sort_by_name = true;
@@ -1414,5 +915,5 @@ int cmd_test(int argc, const char **argv, const char *prefix __used)
 
 	setup_pager();
 
-	return __cmd_test(argc, argv);
+	return __cmd_test();
 }
 {
 	struct trace_entry *te;

-	if (evsel->attr.sample_type & PERF_SAMPLE_TIME) {
+	if (session->sample_type & PERF_SAMPLE_TIME) {
 		if (!first_time || first_time > sample->time)
 			first_time = sample->time;
 		if (last_time < sample->time)
@@ -507,7 +501,7 @@ static int process_sample_event(struct perf_tool *tool __used,
 	}

 	te = (void *)sample->raw_data;
-	if ((evsel->attr.sample_type & PERF_SAMPLE_RAW) && sample->raw_size > 0) {
+	if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) {
 		char *event_str;
 #ifdef SUPPORT_OLD_POWER_EVENTS
 		struct power_entry_old *peo;
@@ -980,7 +974,7 @@ static void write_svg_file(const char *filename)
 	svg_close();
 }

-static struct perf_tool perf_timechart = {
+static struct perf_event_ops event_ops = {
 	.comm = process_comm_event,
 	.fork = process_fork_event,
 	.exit = process_exit_event,
@@ -991,7 +985,7 @@ static int __cmd_timechart(void)
 {
 	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-							 0, false, &perf_timechart);
+							 0, false, &event_ops);
 	int ret = -EINVAL;

 	if (session == NULL)
@@ -1000,7 +994,7 @@ static int __cmd_timechart(void)
 	if (!perf_session__has_traces(session, "timechart record"))
 		goto out_delete;

-	ret = perf_session__process_events(session, &perf_timechart);
+	ret = perf_session__process_events(session, &event_ops);
 	if (ret)
 		goto out_delete;

diff --git a/trunk/tools/perf/builtin-top.c b/trunk/tools/perf/builtin-top.c
index 4f81eeb99875..c9cdedb58134 100644
--- a/trunk/tools/perf/builtin-top.c
+++ b/trunk/tools/perf/builtin-top.c
@@ -64,6 +64,44 @@
 #include
 #include

+static struct perf_top top = {
+	.count_filter = 5,
+	.delay_secs = 2,
+	.target_pid = -1,
+	.target_tid = -1,
+	.freq = 1000, /* 1 KHz */
+};
+
+static bool system_wide = false;
+
+static bool use_tui, use_stdio;
+
+static bool sort_has_symbols;
+
+static bool dont_use_callchains;
+static char callchain_default_opt[] = "fractal,0.5,callee";
+
+
+static int default_interval = 0;
+
+static bool kptr_restrict_warned;
+static bool vmlinux_warned;
+static bool inherit = false;
+static int realtime_prio = 0;
+static bool group = false;
+static bool sample_id_all_avail = true;
+static unsigned int mmap_pages = 128;
+
+static bool dump_symtab = false;
+
+static struct winsize winsize;
+
+static const char *sym_filter = NULL;
+static int sym_pcnt_filter = 5;
+
+/*
+ * Source functions
+ */

 void get_term_dimensions(struct winsize *ws)
 {
@@ -87,23 +125,21 @@ void get_term_dimensions(struct winsize *ws)
 	ws->ws_col = 80;
 }

-static void perf_top__update_print_entries(struct perf_top *top)
+static void update_print_entries(struct winsize *ws)
 {
-	top->print_entries = top->winsize.ws_row;
+	top.print_entries = ws->ws_row;

-	if (top->print_entries > 9)
-		top->print_entries -= 9;
+	if (top.print_entries > 9)
+		top.print_entries -= 9;
 }

-static void perf_top__sig_winch(int sig __used, siginfo_t *info __used, void *arg)
+static void sig_winch_handler(int sig __used)
 {
-	struct perf_top *top = arg;
-
-	get_term_dimensions(&top->winsize);
-	perf_top__update_print_entries(top);
+	get_term_dimensions(&winsize);
+	update_print_entries(&winsize);
 }

-static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
+static int parse_source(struct hist_entry *he)
 {
 	struct symbol *sym;
 	struct annotation *notes;
@@ -134,7 +170,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)

 	pthread_mutex_lock(&notes->lock);

-	if (symbol__alloc_hist(sym) < 0) {
+	if (symbol__alloc_hist(sym,
top.evlist->nr_entries) < 0) { pthread_mutex_unlock(¬es->lock); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); @@ -145,7 +181,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) err = symbol__annotate(sym, map, 0); if (err == 0) { out_assign: - top->sym_filter_entry = he; + top.sym_filter_entry = he; } pthread_mutex_unlock(¬es->lock); @@ -158,16 +194,14 @@ static void __zero_source_counters(struct hist_entry *he) symbol__annotate_zero_histograms(sym); } -static void perf_top__record_precise_ip(struct perf_top *top, - struct hist_entry *he, - int counter, u64 ip) +static void record_precise_ip(struct hist_entry *he, int counter, u64 ip) { struct annotation *notes; struct symbol *sym; if (he == NULL || he->ms.sym == NULL || - ((top->sym_filter_entry == NULL || - top->sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1)) + ((top.sym_filter_entry == NULL || + top.sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1)) return; sym = he->ms.sym; @@ -176,7 +210,8 @@ static void perf_top__record_precise_ip(struct perf_top *top, if (pthread_mutex_trylock(¬es->lock)) return; - if (notes->src == NULL && symbol__alloc_hist(sym) < 0) { + if (notes->src == NULL && + symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { pthread_mutex_unlock(¬es->lock); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); @@ -190,9 +225,8 @@ static void perf_top__record_precise_ip(struct perf_top *top, pthread_mutex_unlock(¬es->lock); } -static void perf_top__show_details(struct perf_top *top) +static void show_details(struct hist_entry *he) { - struct hist_entry *he = top->sym_filter_entry; struct annotation *notes; struct symbol *symbol; int more; @@ -208,15 +242,15 @@ static void perf_top__show_details(struct perf_top *top) if (notes->src == NULL) goto out_unlock; - printf("Showing %s for %s\n", event_name(top->sym_evsel), symbol->name); - printf(" Events Pcnt (>=%d%%)\n", top->sym_pcnt_filter); + printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name); + printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); - more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel->idx, - 0, top->sym_pcnt_filter, top->print_entries, 4); - if (top->zero) - symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx); + more = symbol__annotate_printf(symbol, he->ms.map, top.sym_evsel->idx, + 0, sym_pcnt_filter, top.print_entries, 4); + if (top.zero) + symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx); else - symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx); + symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx); if (more != 0) printf("%d lines not displayed, maybe increase display entries [e]\n", more); out_unlock: @@ -225,9 +259,11 @@ static void perf_top__show_details(struct perf_top *top) static const char CONSOLE_CLEAR[] = ""; -static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel, - struct addr_location *al, - struct perf_sample *sample) +static struct hist_entry * + perf_session__add_hist_entry(struct perf_session *session, + struct addr_location *al, + struct perf_sample *sample, + struct perf_evsel *evsel) { struct hist_entry *he; @@ -235,51 +271,50 @@ static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel, if (he == NULL) return NULL; - evsel->hists.stats.total_period += sample->period; + session->hists.stats.total_period += sample->period; hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); return he; } -static void 
perf_top__print_sym_table(struct perf_top *top) +static void print_sym_table(void) { char bf[160]; int printed = 0; - const int win_width = top->winsize.ws_col - 1; + const int win_width = winsize.ws_col - 1; puts(CONSOLE_CLEAR); - perf_top__header_snprintf(top, bf, sizeof(bf)); + perf_top__header_snprintf(&top, bf, sizeof(bf)); printf("%s\n", bf); - perf_top__reset_sample_counters(top); + perf_top__reset_sample_counters(&top); printf("%-*.*s\n", win_width, win_width, graph_dotted_line); - if (top->sym_evsel->hists.stats.nr_lost_warned != - top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) { - top->sym_evsel->hists.stats.nr_lost_warned = - top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]; + if (top.sym_evsel->hists.stats.nr_lost_warned != + top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) { + top.sym_evsel->hists.stats.nr_lost_warned = + top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]; color_fprintf(stdout, PERF_COLOR_RED, "WARNING: LOST %d chunks, Check IO/CPU overload", - top->sym_evsel->hists.stats.nr_lost_warned); + top.sym_evsel->hists.stats.nr_lost_warned); ++printed; } - if (top->sym_filter_entry) { - perf_top__show_details(top); + if (top.sym_filter_entry) { + show_details(top.sym_filter_entry); return; } - hists__collapse_resort_threaded(&top->sym_evsel->hists); - hists__output_resort_threaded(&top->sym_evsel->hists); - hists__decay_entries_threaded(&top->sym_evsel->hists, - top->hide_user_symbols, - top->hide_kernel_symbols); - hists__output_recalc_col_len(&top->sym_evsel->hists, - top->winsize.ws_row - 3); + hists__collapse_resort_threaded(&top.sym_evsel->hists); + hists__output_resort_threaded(&top.sym_evsel->hists); + hists__decay_entries_threaded(&top.sym_evsel->hists, + top.hide_user_symbols, + top.hide_kernel_symbols); + hists__output_recalc_col_len(&top.sym_evsel->hists, winsize.ws_row - 3); putchar('\n'); - hists__fprintf(&top->sym_evsel->hists, NULL, false, false, - top->winsize.ws_row - 4 - printed, win_width, stdout); + hists__fprintf(&top.sym_evsel->hists, NULL, false, false, + winsize.ws_row - 4 - printed, win_width, stdout); } static void prompt_integer(int *target, const char *msg) @@ -317,17 +352,17 @@ static void prompt_percent(int *target, const char *msg) *target = tmp; } -static void perf_top__prompt_symbol(struct perf_top *top, const char *msg) +static void prompt_symbol(struct hist_entry **target, const char *msg) { char *buf = malloc(0), *p; - struct hist_entry *syme = top->sym_filter_entry, *n, *found = NULL; + struct hist_entry *syme = *target, *n, *found = NULL; struct rb_node *next; size_t dummy = 0; /* zero counters of active symbol */ if (syme) { __zero_source_counters(syme); - top->sym_filter_entry = NULL; + *target = NULL; } fprintf(stdout, "\n%s: ", msg); @@ -338,7 +373,7 @@ static void perf_top__prompt_symbol(struct perf_top *top, const char *msg) if (p) *p = 0; - next = rb_first(&top->sym_evsel->hists.entries); + next = rb_first(&top.sym_evsel->hists.entries); while (next) { n = rb_entry(next, struct hist_entry, rb_node); if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) { @@ -351,46 +386,47 @@ static void perf_top__prompt_symbol(struct perf_top *top, const char *msg) if (!found) { fprintf(stderr, "Sorry, %s is not active.\n", buf); sleep(1); + return; } else - perf_top__parse_source(top, found); + parse_source(found); out_free: free(buf); } -static void perf_top__print_mapped_keys(struct perf_top *top) +static void print_mapped_keys(void) { char *name = NULL; - if (top->sym_filter_entry) { - struct symbol *sym = 
top->sym_filter_entry->ms.sym; + if (top.sym_filter_entry) { + struct symbol *sym = top.sym_filter_entry->ms.sym; name = sym->name; } fprintf(stdout, "\nMapped keys:\n"); - fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top->delay_secs); - fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top->print_entries); + fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top.delay_secs); + fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top.print_entries); - if (top->evlist->nr_entries > 1) - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top->sym_evsel)); + if (top.evlist->nr_entries > 1) + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top.sym_evsel)); - fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter); + fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top.count_filter); - fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", top->sym_pcnt_filter); + fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); fprintf(stdout, "\t[S] stop annotation.\n"); fprintf(stdout, "\t[K] hide kernel_symbols symbols. \t(%s)\n", - top->hide_kernel_symbols ? "yes" : "no"); + top.hide_kernel_symbols ? "yes" : "no"); fprintf(stdout, "\t[U] hide user symbols. \t(%s)\n", - top->hide_user_symbols ? "yes" : "no"); - fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top->zero ? 1 : 0); + top.hide_user_symbols ? "yes" : "no"); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top.zero ? 1 : 0); fprintf(stdout, "\t[qQ] quit.\n"); } -static int perf_top__key_mapped(struct perf_top *top, int c) +static int key_mapped(int c) { switch (c) { case 'd': @@ -406,7 +442,7 @@ static int perf_top__key_mapped(struct perf_top *top, int c) case 'S': return 1; case 'E': - return top->evlist->nr_entries > 1 ? 1 : 0; + return top.evlist->nr_entries > 1 ? 
1 : 0; default: break; } @@ -414,13 +450,13 @@ static int perf_top__key_mapped(struct perf_top *top, int c) return 0; } -static void perf_top__handle_keypress(struct perf_top *top, int c) +static void handle_keypress(int c) { - if (!perf_top__key_mapped(top, c)) { + if (!key_mapped(c)) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; struct termios tc, save; - perf_top__print_mapped_keys(top); + print_mapped_keys(); fprintf(stdout, "\nEnter selection, or unmapped key to continue: "); fflush(stdout); @@ -435,86 +471,81 @@ static void perf_top__handle_keypress(struct perf_top *top, int c) c = getc(stdin); tcsetattr(0, TCSAFLUSH, &save); - if (!perf_top__key_mapped(top, c)) + if (!key_mapped(c)) return; } switch (c) { case 'd': - prompt_integer(&top->delay_secs, "Enter display delay"); - if (top->delay_secs < 1) - top->delay_secs = 1; + prompt_integer(&top.delay_secs, "Enter display delay"); + if (top.delay_secs < 1) + top.delay_secs = 1; break; case 'e': - prompt_integer(&top->print_entries, "Enter display entries (lines)"); - if (top->print_entries == 0) { - struct sigaction act = { - .sa_sigaction = perf_top__sig_winch, - .sa_flags = SA_SIGINFO, - }; - perf_top__sig_winch(SIGWINCH, NULL, top); - sigaction(SIGWINCH, &act, NULL); + prompt_integer(&top.print_entries, "Enter display entries (lines)"); + if (top.print_entries == 0) { + sig_winch_handler(SIGWINCH); + signal(SIGWINCH, sig_winch_handler); } else signal(SIGWINCH, SIG_DFL); break; case 'E': - if (top->evlist->nr_entries > 1) { + if (top.evlist->nr_entries > 1) { /* Select 0 as the default event: */ int counter = 0; fprintf(stderr, "\nAvailable events:"); - list_for_each_entry(top->sym_evsel, &top->evlist->entries, node) - fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, event_name(top->sym_evsel)); + list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) + fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel)); prompt_integer(&counter, "Enter details event counter"); - if (counter >= top->evlist->nr_entries) { - top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node); - fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top->sym_evsel)); + if (counter >= top.evlist->nr_entries) { + top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); + fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel)); sleep(1); break; } - list_for_each_entry(top->sym_evsel, &top->evlist->entries, node) - if (top->sym_evsel->idx == counter) + list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) + if (top.sym_evsel->idx == counter) break; } else - top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node); + top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); break; case 'f': - prompt_integer(&top->count_filter, "Enter display event count filter"); + prompt_integer(&top.count_filter, "Enter display event count filter"); break; case 'F': - prompt_percent(&top->sym_pcnt_filter, - "Enter details display event filter (percent)"); + prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); break; case 'K': - top->hide_kernel_symbols = !top->hide_kernel_symbols; + top.hide_kernel_symbols = !top.hide_kernel_symbols; break; case 'q': case 'Q': printf("exiting.\n"); - if (top->dump_symtab) - perf_session__fprintf_dsos(top->session, stderr); + if (dump_symtab) + perf_session__fprintf_dsos(top.session, stderr); exit(0); case 's': - perf_top__prompt_symbol(top, 
"Enter details symbol"); + prompt_symbol(&top.sym_filter_entry, "Enter details symbol"); break; case 'S': - if (!top->sym_filter_entry) + if (!top.sym_filter_entry) break; else { - struct hist_entry *syme = top->sym_filter_entry; + struct hist_entry *syme = top.sym_filter_entry; - top->sym_filter_entry = NULL; + top.sym_filter_entry = NULL; __zero_source_counters(syme); } break; case 'U': - top->hide_user_symbols = !top->hide_user_symbols; + top.hide_user_symbols = !top.hide_user_symbols; break; case 'z': - top->zero = !top->zero; + top.zero = !top.zero; break; default: break; @@ -532,30 +563,28 @@ static void perf_top__sort_new_samples(void *arg) hists__collapse_resort_threaded(&t->sym_evsel->hists); hists__output_resort_threaded(&t->sym_evsel->hists); hists__decay_entries_threaded(&t->sym_evsel->hists, - t->hide_user_symbols, - t->hide_kernel_symbols); + top.hide_user_symbols, + top.hide_kernel_symbols); } -static void *display_thread_tui(void *arg) +static void *display_thread_tui(void *arg __used) { - struct perf_top *top = arg; const char *help = "For a higher level overview, try: perf top --sort comm,dso"; - perf_top__sort_new_samples(top); - perf_evlist__tui_browse_hists(top->evlist, help, + perf_top__sort_new_samples(&top); + perf_evlist__tui_browse_hists(top.evlist, help, perf_top__sort_new_samples, - top, top->delay_secs); + &top, top.delay_secs); exit_browser(0); exit(0); return NULL; } -static void *display_thread(void *arg) +static void *display_thread(void *arg __used) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; struct termios tc, save; - struct perf_top *top = arg; int delay_msecs, c; tcgetattr(0, &save); @@ -566,13 +595,13 @@ static void *display_thread(void *arg) pthread__unblock_sigwinch(); repeat: - delay_msecs = top->delay_secs * 1000; + delay_msecs = top.delay_secs * 1000; tcsetattr(0, TCSANOW, &tc); /* trash return*/ getc(stdin); while (1) { - perf_top__print_sym_table(top); + print_sym_table(); /* * Either timeout expired or we got an EINTR due to SIGWINCH, * refresh screen in both cases. @@ -592,7 +621,7 @@ static void *display_thread(void *arg) c = getc(stdin); tcsetattr(0, TCSAFLUSH, &save); - perf_top__handle_keypress(top, c); + handle_keypress(c); goto repeat; return NULL; @@ -644,17 +673,47 @@ static int symbol_filter(struct map *map __used, struct symbol *sym) return 0; } -static void perf_event__process_sample(struct perf_tool *tool, - const union perf_event *event, +static void perf_event__process_sample(const union perf_event *event, struct perf_evsel *evsel, struct perf_sample *sample, - struct machine *machine) + struct perf_session *session) { - struct perf_top *top = container_of(tool, struct perf_top, tool); struct symbol *parent = NULL; u64 ip = event->ip.ip; struct addr_location al; + struct machine *machine; int err; + u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + + ++top.samples; + + switch (origin) { + case PERF_RECORD_MISC_USER: + ++top.us_samples; + if (top.hide_user_symbols) + return; + machine = perf_session__find_host_machine(session); + break; + case PERF_RECORD_MISC_KERNEL: + ++top.kernel_samples; + if (top.hide_kernel_symbols) + return; + machine = perf_session__find_host_machine(session); + break; + case PERF_RECORD_MISC_GUEST_KERNEL: + ++top.guest_kernel_samples; + machine = perf_session__find_machine(session, event->ip.pid); + break; + case PERF_RECORD_MISC_GUEST_USER: + ++top.guest_us_samples; + /* + * TODO: we don't process guest user from host side + * except simple counting. 
+ */ + return; + default: + return; + } if (!machine && perf_guest) { pr_err("Can't find guest [%d]'s kernel information\n", @@ -663,14 +722,14 @@ static void perf_event__process_sample(struct perf_tool *tool, } if (event->header.misc & PERF_RECORD_MISC_EXACT_IP) - top->exact_samples++; + top.exact_samples++; - if (perf_event__preprocess_sample(event, machine, &al, sample, + if (perf_event__preprocess_sample(event, session, &al, sample, symbol_filter) < 0 || al.filtered) return; - if (!top->kptr_restrict_warned && + if (!kptr_restrict_warned && symbol_conf.kptr_restrict && al.cpumode == PERF_RECORD_MISC_KERNEL) { ui__warning( @@ -681,7 +740,7 @@ static void perf_event__process_sample(struct perf_tool *tool, " modules" : ""); if (use_browser <= 0) sleep(5); - top->kptr_restrict_warned = true; + kptr_restrict_warned = true; } if (al.sym == NULL) { @@ -697,7 +756,7 @@ static void perf_event__process_sample(struct perf_tool *tool, * --hide-kernel-symbols, even if the user specifies an * invalid --vmlinux ;-) */ - if (!top->kptr_restrict_warned && !top->vmlinux_warned && + if (!kptr_restrict_warned && !vmlinux_warned && al.map == machine->vmlinux_maps[MAP__FUNCTION] && RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) { if (symbol_conf.vmlinux_name) { @@ -710,7 +769,7 @@ static void perf_event__process_sample(struct perf_tool *tool, if (use_browser <= 0) sleep(5); - top->vmlinux_warned = true; + vmlinux_warned = true; } } @@ -719,109 +778,70 @@ static void perf_event__process_sample(struct perf_tool *tool, if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = machine__resolve_callchain(machine, evsel, al.thread, - sample->callchain, &parent); + err = perf_session__resolve_callchain(session, al.thread, + sample->callchain, &parent); if (err) return; } - he = perf_evsel__add_hist_entry(evsel, &al, sample); + he = perf_session__add_hist_entry(session, &al, sample, evsel); if (he == NULL) { pr_err("Problem incrementing symbol period, skipping event\n"); return; } if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, &evsel->hists.callchain_cursor, + err = callchain_append(he->callchain, &session->callchain_cursor, sample->period); if (err) return; } - if (top->sort_has_symbols) - perf_top__record_precise_ip(top, he, evsel->idx, ip); + if (sort_has_symbols) + record_precise_ip(he, evsel->idx, ip); } return; } -static void perf_top__mmap_read_idx(struct perf_top *top, int idx) +static void perf_session__mmap_read_idx(struct perf_session *self, int idx) { struct perf_sample sample; struct perf_evsel *evsel; - struct perf_session *session = top->session; union perf_event *event; - struct machine *machine; - u8 origin; int ret; - while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) { - ret = perf_session__parse_sample(session, event, &sample); + while ((event = perf_evlist__mmap_read(top.evlist, idx)) != NULL) { + ret = perf_session__parse_sample(self, event, &sample); if (ret) { pr_err("Can't parse sample, err = %d\n", ret); continue; } - evsel = perf_evlist__id2evsel(session->evlist, sample.id); + evsel = perf_evlist__id2evsel(self->evlist, sample.id); assert(evsel != NULL); - origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (event->header.type == PERF_RECORD_SAMPLE) - ++top->samples; - - switch (origin) { - case PERF_RECORD_MISC_USER: - ++top->us_samples; - if (top->hide_user_symbols) - continue; - machine = perf_session__find_host_machine(session); - break; - case PERF_RECORD_MISC_KERNEL: - ++top->kernel_samples; - if 
(top->hide_kernel_symbols) - continue; - machine = perf_session__find_host_machine(session); - break; - case PERF_RECORD_MISC_GUEST_KERNEL: - ++top->guest_kernel_samples; - machine = perf_session__find_machine(session, event->ip.pid); - break; - case PERF_RECORD_MISC_GUEST_USER: - ++top->guest_us_samples; - /* - * TODO: we don't process guest user from host side - * except simple counting. - */ - /* Fall thru */ - default: - continue; - } - - - if (event->header.type == PERF_RECORD_SAMPLE) { - perf_event__process_sample(&top->tool, event, evsel, - &sample, machine); - } else if (event->header.type < PERF_RECORD_MAX) { + perf_event__process_sample(event, evsel, &sample, self); + else if (event->header.type < PERF_RECORD_MAX) { hists__inc_nr_events(&evsel->hists, event->header.type); - perf_event__process(&top->tool, event, &sample, machine); + perf_event__process(event, &sample, self); } else - ++session->hists.stats.nr_unknown_events; + ++self->hists.stats.nr_unknown_events; } } -static void perf_top__mmap_read(struct perf_top *top) +static void perf_session__mmap_read(struct perf_session *self) { int i; - for (i = 0; i < top->evlist->nr_mmaps; i++) - perf_top__mmap_read_idx(top, i); + for (i = 0; i < top.evlist->nr_mmaps; i++) + perf_session__mmap_read_idx(self, i); } -static void perf_top__start_counters(struct perf_top *top) +static void start_counters(struct perf_evlist *evlist) { struct perf_evsel *counter, *first; - struct perf_evlist *evlist = top->evlist; first = list_entry(evlist->entries.next, struct perf_evsel, node); @@ -829,15 +849,15 @@ static void perf_top__start_counters(struct perf_top *top) struct perf_event_attr *attr = &counter->attr; struct xyarray *group_fd = NULL; - if (top->group && counter != first) + if (group && counter != first) group_fd = first->fd; attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - if (top->freq) { + if (top.freq) { attr->sample_type |= PERF_SAMPLE_PERIOD; attr->freq = 1; - attr->sample_freq = top->freq; + attr->sample_freq = top.freq; } if (evlist->nr_entries > 1) { @@ -850,23 +870,23 @@ static void perf_top__start_counters(struct perf_top *top) attr->mmap = 1; attr->comm = 1; - attr->inherit = top->inherit; + attr->inherit = inherit; retry_sample_id: - attr->sample_id_all = top->sample_id_all_avail ? 1 : 0; + attr->sample_id_all = sample_id_all_avail ? 
1 : 0; try_again: - if (perf_evsel__open(counter, top->evlist->cpus, - top->evlist->threads, top->group, + if (perf_evsel__open(counter, top.evlist->cpus, + top.evlist->threads, group, group_fd) < 0) { int err = errno; if (err == EPERM || err == EACCES) { ui__error_paranoid(); goto out_err; - } else if (err == EINVAL && top->sample_id_all_avail) { + } else if (err == EINVAL && sample_id_all_avail) { /* * Old kernel, no attr->sample_id_type_all field */ - top->sample_id_all_avail = false; + sample_id_all_avail = false; goto retry_sample_id; } /* @@ -900,7 +920,7 @@ static void perf_top__start_counters(struct perf_top *top) } } - if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) { + if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) { ui__warning("Failed to mmap with %d (%s)\n", errno, strerror(errno)); goto out_err; @@ -913,14 +933,14 @@ static void perf_top__start_counters(struct perf_top *top) exit(0); } -static int perf_top__setup_sample_type(struct perf_top *top) +static int setup_sample_type(void) { - if (!top->sort_has_symbols) { + if (!sort_has_symbols) { if (symbol_conf.use_callchain) { ui__warning("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) { + } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE) { if (callchain_register_param(&callchain_param) < 0) { ui__warning("Can't register callchain params.\n"); return -EINVAL; @@ -930,7 +950,7 @@ static int perf_top__setup_sample_type(struct perf_top *top) return 0; } -static int __cmd_top(struct perf_top *top) +static int __cmd_top(void) { pthread_t thread; int ret; @@ -938,40 +958,39 @@ static int __cmd_top(struct perf_top *top) * FIXME: perf_session__new should allow passing a O_MMAP, so that all this * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. */ - top->session = perf_session__new(NULL, O_WRONLY, false, false, NULL); - if (top->session == NULL) + top.session = perf_session__new(NULL, O_WRONLY, false, false, NULL); + if (top.session == NULL) return -ENOMEM; - ret = perf_top__setup_sample_type(top); + ret = setup_sample_type(); if (ret) goto out_delete; - if (top->target_tid != -1) - perf_event__synthesize_thread_map(&top->tool, top->evlist->threads, - perf_event__process, - &top->session->host_machine); + if (top.target_tid != -1) + perf_event__synthesize_thread_map(top.evlist->threads, + perf_event__process, top.session); else - perf_event__synthesize_threads(&top->tool, perf_event__process, - &top->session->host_machine); - perf_top__start_counters(top); - top->session->evlist = top->evlist; - perf_session__update_sample_type(top->session); + perf_event__synthesize_threads(perf_event__process, top.session); + + start_counters(top.evlist); + top.session->evlist = top.evlist; + perf_session__update_sample_type(top.session); /* Wait for a minimal set of events before starting the snapshot */ - poll(top->evlist->pollfd, top->evlist->nr_fds, 100); + poll(top.evlist->pollfd, top.evlist->nr_fds, 100); - perf_top__mmap_read(top); + perf_session__mmap_read(top.session); if (pthread_create(&thread, NULL, (use_browser > 0 ? 
display_thread_tui : - display_thread), top)) { + display_thread), NULL)) { printf("Could not create display thread.\n"); exit(-1); } - if (top->realtime_prio) { + if (realtime_prio) { struct sched_param param; - param.sched_priority = top->realtime_prio; + param.sched_priority = realtime_prio; if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { printf("Could not set realtime priority.\n"); exit(-1); @@ -979,25 +998,25 @@ static int __cmd_top(struct perf_top *top) } while (1) { - u64 hits = top->samples; + u64 hits = top.samples; - perf_top__mmap_read(top); + perf_session__mmap_read(top.session); - if (hits == top->samples) - ret = poll(top->evlist->pollfd, top->evlist->nr_fds, 100); + if (hits == top.samples) + ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100); } out_delete: - perf_session__delete(top->session); - top->session = NULL; + perf_session__delete(top.session); + top.session = NULL; return 0; } static int -parse_callchain_opt(const struct option *opt, const char *arg, int unset) +parse_callchain_opt(const struct option *opt __used, const char *arg, + int unset) { - struct perf_top *top = (struct perf_top *)opt->value; char *tok, *tok2; char *endptr; @@ -1005,7 +1024,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset) * --no-call-graph */ if (unset) { - top->dont_use_callchains = true; + dont_use_callchains = true; return 0; } @@ -1033,7 +1052,9 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset) symbol_conf.use_callchain = false; return 0; - } else + } + + else return -1; /* get the min percentage */ @@ -1077,32 +1098,17 @@ static const char * const top_usage[] = { NULL }; -int cmd_top(int argc, const char **argv, const char *prefix __used) -{ - struct perf_evsel *pos; - int status = -ENOMEM; - struct perf_top top = { - .count_filter = 5, - .delay_secs = 2, - .target_pid = -1, - .target_tid = -1, - .freq = 1000, /* 1 KHz */ - .sample_id_all_avail = true, - .mmap_pages = 128, - .sym_pcnt_filter = 5, - }; - char callchain_default_opt[] = "fractal,0.5,callee"; - const struct option options[] = { +static const struct option options[] = { OPT_CALLBACK('e', "event", &top.evlist, "event", "event selector. 
use 'perf list' to list available events", parse_events_option), - OPT_INTEGER('c', "count", &top.default_interval, + OPT_INTEGER('c', "count", &default_interval, "event period to sample"), OPT_INTEGER('p', "pid", &top.target_pid, "profile events on existing process id"), OPT_INTEGER('t', "tid", &top.target_tid, "profile events on existing thread id"), - OPT_BOOLEAN('a', "all-cpus", &top.system_wide, + OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), OPT_STRING('C', "cpu", &top.cpu_list, "cpu", "list of cpus to monitor"), @@ -1110,20 +1116,20 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) "file", "vmlinux pathname"), OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols, "hide kernel symbols"), - OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"), - OPT_INTEGER('r', "realtime", &top.realtime_prio, + OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), + OPT_INTEGER('r', "realtime", &realtime_prio, "collect data with this RT SCHED_FIFO priority"), OPT_INTEGER('d', "delay", &top.delay_secs, "number of seconds to delay between refreshes"), - OPT_BOOLEAN('D', "dump-symtab", &top.dump_symtab, + OPT_BOOLEAN('D', "dump-symtab", &dump_symtab, "dump the symbol table used for profiling"), OPT_INTEGER('f', "count-filter", &top.count_filter, "only display functions with more events than this"), - OPT_BOOLEAN('g', "group", &top.group, + OPT_BOOLEAN('g', "group", &group, "put the counters into a counter group"), - OPT_BOOLEAN('i', "inherit", &top.inherit, + OPT_BOOLEAN('i', "inherit", &inherit, "child tasks inherit counters"), - OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name", + OPT_STRING(0, "sym-annotate", &sym_filter, "symbol name", "symbol to annotate"), OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"), @@ -1133,15 +1139,15 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) "display this many functions"), OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, "hide user symbols"), - OPT_BOOLEAN(0, "tui", &top.use_tui, "Use the TUI interface"), - OPT_BOOLEAN(0, "stdio", &top.use_stdio, "Use the stdio interface"), + OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order", + OPT_CALLBACK_DEFAULT('G', "call-graph", NULL, "output_type,min_percent, call_order", "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. " "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt), @@ -1160,7 +1166,12 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. 
-M intel for intel syntax)"), OPT_END() - }; +}; + +int cmd_top(int argc, const char **argv, const char *prefix __used) +{ + struct perf_evsel *pos; + int status = -ENOMEM; top.evlist = perf_evlist__new(NULL, NULL); if (top.evlist == NULL) @@ -1177,9 +1188,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) setup_sorting(top_usage, options); - if (top.use_stdio) + if (use_stdio) use_browser = 0; - else if (top.use_tui) + else if (use_tui) use_browser = 1; setup_browser(false); @@ -1204,31 +1215,38 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) return -ENOMEM; } - symbol_conf.nr_events = top.evlist->nr_entries; - if (top.delay_secs < 1) top.delay_secs = 1; /* * User specified count overrides default frequency. */ - if (top.default_interval) + if (default_interval) top.freq = 0; else if (top.freq) { - top.default_interval = top.freq; + default_interval = top.freq; } else { fprintf(stderr, "frequency and count are zero, aborting\n"); exit(EXIT_FAILURE); } list_for_each_entry(pos, &top.evlist->entries, node) { + if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr, + top.evlist->threads->nr) < 0) + goto out_free_fd; /* * Fill in the ones not specifically initialized via -c: */ - if (!pos->attr.sample_period) - pos->attr.sample_period = top.default_interval; + if (pos->attr.sample_period) + continue; + + pos->attr.sample_period = default_interval; } + if (perf_evlist__alloc_pollfd(top.evlist) < 0 || + perf_evlist__alloc_mmap(top.evlist) < 0) + goto out_free_fd; + top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); symbol_conf.priv_size = sizeof(struct annotation); @@ -1245,20 +1263,16 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) * Avoid annotation data structures overhead when symbols aren't on the * sort list. 
*/ - top.sort_has_symbols = sort_sym.list.next != NULL; + sort_has_symbols = sort_sym.list.next != NULL; - get_term_dimensions(&top.winsize); + get_term_dimensions(&winsize); if (top.print_entries == 0) { - struct sigaction act = { - .sa_sigaction = perf_top__sig_winch, - .sa_flags = SA_SIGINFO, - }; - perf_top__update_print_entries(&top); - sigaction(SIGWINCH, &act, NULL); + update_print_entries(&winsize); + signal(SIGWINCH, sig_winch_handler); } - status = __cmd_top(&top); - + status = __cmd_top(); +out_free_fd: perf_evlist__delete(top.evlist); return status; diff --git a/trunk/tools/perf/perf.c b/trunk/tools/perf/perf.c index 2b2e225a4d4c..73d0cac8b67e 100644 --- a/trunk/tools/perf/perf.c +++ b/trunk/tools/perf/perf.c @@ -29,6 +29,8 @@ struct pager_config { int val; }; +static char debugfs_mntpt[MAXPATHLEN]; + static int pager_command_config(const char *var, const char *value, void *data) { struct pager_config *c = data; @@ -79,6 +81,15 @@ static void commit_pager_choice(void) } } +static void set_debugfs_path(void) +{ + char *path; + + path = getenv(PERF_DEBUGFS_ENVIRONMENT); + snprintf(debugfs_path, MAXPATHLEN, "%s/%s", path ?: debugfs_mntpt, + "tracing/events"); +} + static int handle_options(const char ***argv, int *argc, int *envchanged) { int handled = 0; @@ -150,14 +161,15 @@ static int handle_options(const char ***argv, int *argc, int *envchanged) fprintf(stderr, "No directory given for --debugfs-dir.\n"); usage(perf_usage_string); } - debugfs_set_path((*argv)[1]); + strncpy(debugfs_mntpt, (*argv)[1], MAXPATHLEN); + debugfs_mntpt[MAXPATHLEN - 1] = '\0'; if (envchanged) *envchanged = 1; (*argv)++; (*argc)--; } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) { - debugfs_set_path(cmd + strlen(CMD_DEBUGFS_DIR)); - fprintf(stderr, "dir: %s\n", debugfs_mountpoint); + strncpy(debugfs_mntpt, cmd + strlen(CMD_DEBUGFS_DIR), MAXPATHLEN); + debugfs_mntpt[MAXPATHLEN - 1] = '\0'; if (envchanged) *envchanged = 1; } else { @@ -269,6 +281,7 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) if (use_pager == -1 && p->option & USE_PAGER) use_pager = 1; commit_pager_choice(); + set_debugfs_path(); status = p->fn(argc, argv, prefix); exit_browser(status); @@ -403,6 +416,17 @@ static int run_argv(int *argcp, const char ***argv) return done_alias; } +/* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */ +static void get_debugfs_mntpt(void) +{ + const char *path = debugfs_mount(NULL); + + if (path) + strncpy(debugfs_mntpt, path, sizeof(debugfs_mntpt)); + else + debugfs_mntpt[0] = '\0'; +} + static void pthread__block_sigwinch(void) { sigset_t set; @@ -429,7 +453,7 @@ int main(int argc, const char **argv) if (!cmd) cmd = "perf-help"; /* get debugfs mount point from /proc/mounts */ - debugfs_mount(NULL); + get_debugfs_mntpt(); /* * "perf-xxxx" is the same as "perf xxxx", but we obviously: * @@ -452,6 +476,7 @@ int main(int argc, const char **argv) argc--; handle_options(&argv, &argc, NULL); commit_pager_choice(); + set_debugfs_path(); set_buildid_dir(); if (argc > 0) { diff --git a/trunk/tools/perf/perf.h b/trunk/tools/perf/perf.h index 64f8bee31ced..914c895510f7 100644 --- a/trunk/tools/perf/perf.h +++ b/trunk/tools/perf/perf.h @@ -185,28 +185,4 @@ extern const char perf_version_string[]; void pthread__unblock_sigwinch(void); -struct perf_record_opts { - pid_t target_pid; - pid_t target_tid; - bool call_graph; - bool group; - bool inherit_stat; - bool no_delay; - bool no_inherit; - bool no_samples; - bool pipe_output; - bool raw_samples; - bool sample_address; - 
bool sample_time; - bool sample_id_all_avail; - bool system_wide; - bool period; - unsigned int freq; - unsigned int mmap_pages; - unsigned int user_freq; - u64 default_interval; - u64 user_interval; - const char *cpu_list; -}; - #endif diff --git a/trunk/tools/perf/util/annotate.c b/trunk/tools/perf/util/annotate.c index 011ed2676604..119e996035c8 100644 --- a/trunk/tools/perf/util/annotate.c +++ b/trunk/tools/perf/util/annotate.c @@ -25,17 +25,17 @@ int symbol__annotate_init(struct map *map __used, struct symbol *sym) return 0; } -int symbol__alloc_hist(struct symbol *sym) +int symbol__alloc_hist(struct symbol *sym, int nevents) { struct annotation *notes = symbol__annotation(sym); size_t sizeof_sym_hist = (sizeof(struct sym_hist) + (sym->end - sym->start) * sizeof(u64)); - notes->src = zalloc(sizeof(*notes->src) + symbol_conf.nr_events * sizeof_sym_hist); + notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist); if (notes->src == NULL) return -1; notes->src->sizeof_sym_hist = sizeof_sym_hist; - notes->src->nr_histograms = symbol_conf.nr_events; + notes->src->nr_histograms = nevents; INIT_LIST_HEAD(¬es->src->source); return 0; } @@ -334,7 +334,7 @@ int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize) disassembler_style ? "-M " : "", disassembler_style ? disassembler_style : "", map__rip_2objdump(map, sym->start), - map__rip_2objdump(map, sym->end+1), + map__rip_2objdump(map, sym->end), symbol_conf.annotate_asm_raw ? "" : "--no-show-raw", symbol_conf.annotate_src ? "-S" : "", symfs_filename, filename); diff --git a/trunk/tools/perf/util/annotate.h b/trunk/tools/perf/util/annotate.h index efa5dc82bfae..d9072523d342 100644 --- a/trunk/tools/perf/util/annotate.h +++ b/trunk/tools/perf/util/annotate.h @@ -72,7 +72,7 @@ static inline struct annotation *symbol__annotation(struct symbol *sym) int symbol__inc_addr_samples(struct symbol *sym, struct map *map, int evidx, u64 addr); -int symbol__alloc_hist(struct symbol *sym); +int symbol__alloc_hist(struct symbol *sym, int nevents); void symbol__annotate_zero_histograms(struct symbol *sym); int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize); @@ -99,7 +99,8 @@ static inline int symbol__tui_annotate(struct symbol *sym __used, } #else int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, - void(*timer)(void *arg), void *arg, int delay_secs); + int nr_events, void(*timer)(void *arg), void *arg, + int delay_secs); #endif extern const char *disassembler_style; diff --git a/trunk/tools/perf/util/build-id.c b/trunk/tools/perf/util/build-id.c index dff9c7a725f4..a91cd99f26ea 100644 --- a/trunk/tools/perf/util/build-id.c +++ b/trunk/tools/perf/util/build-id.c @@ -13,18 +13,15 @@ #include "symbol.h" #include #include "debug.h" -#include "session.h" -#include "tool.h" -static int build_id__mark_dso_hit(struct perf_tool *tool __used, - union perf_event *event, +static int build_id__mark_dso_hit(union perf_event *event, struct perf_sample *sample __used, struct perf_evsel *evsel __used, - struct machine *machine) + struct perf_session *session) { struct addr_location al; u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - struct thread *thread = machine__findnew_thread(machine, event->ip.pid); + struct thread *thread = perf_session__findnew(session, event->ip.pid); if (thread == NULL) { pr_err("problem processing %d event, skipping it.\n", @@ -32,8 +29,8 @@ static int build_id__mark_dso_hit(struct perf_tool *tool __used, return -1; } - thread__find_addr_map(thread, 
machine, cpumode, MAP__FUNCTION, - event->ip.ip, &al); + thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, + event->ip.pid, event->ip.ip, &al); if (al.map != NULL) al.map->dso->hit = 1; @@ -41,26 +38,25 @@ static int build_id__mark_dso_hit(struct perf_tool *tool __used, return 0; } -static int perf_event__exit_del_thread(struct perf_tool *tool __used, - union perf_event *event, +static int perf_event__exit_del_thread(union perf_event *event, struct perf_sample *sample __used, - struct machine *machine) + struct perf_session *session) { - struct thread *thread = machine__findnew_thread(machine, event->fork.tid); + struct thread *thread = perf_session__findnew(session, event->fork.tid); dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, event->fork.ppid, event->fork.ptid); if (thread) { - rb_erase(&thread->rb_node, &machine->threads); - machine->last_match = NULL; + rb_erase(&thread->rb_node, &session->threads); + session->last_match = NULL; thread__delete(thread); } return 0; } -struct perf_tool build_id__mark_dso_hit_ops = { +struct perf_event_ops build_id__mark_dso_hit_ops = { .sample = build_id__mark_dso_hit, .mmap = perf_event__process_mmap, .fork = perf_event__process_task, diff --git a/trunk/tools/perf/util/build-id.h b/trunk/tools/perf/util/build-id.h index a993ba87d996..5dafb00eaa06 100644 --- a/trunk/tools/perf/util/build-id.h +++ b/trunk/tools/perf/util/build-id.h @@ -3,7 +3,7 @@ #include "session.h" -extern struct perf_tool build_id__mark_dso_hit_ops; +extern struct perf_event_ops build_id__mark_dso_hit_ops; char *dso__build_id_filename(struct dso *self, char *bf, size_t size); diff --git a/trunk/tools/perf/util/callchain.h b/trunk/tools/perf/util/callchain.h index 7f9c0f1ae3a9..9b4ff16cac96 100644 --- a/trunk/tools/perf/util/callchain.h +++ b/trunk/tools/perf/util/callchain.h @@ -101,9 +101,6 @@ int callchain_append(struct callchain_root *root, int callchain_merge(struct callchain_cursor *cursor, struct callchain_root *dst, struct callchain_root *src); -struct ip_callchain; -union perf_event; - bool ip_callchain__valid(struct ip_callchain *chain, const union perf_event *event); /* diff --git a/trunk/tools/perf/util/cgroup.c b/trunk/tools/perf/util/cgroup.c index dbe2f16b1a1a..96bee5c46008 100644 --- a/trunk/tools/perf/util/cgroup.c +++ b/trunk/tools/perf/util/cgroup.c @@ -3,6 +3,7 @@ #include "parse-options.h" #include "evsel.h" #include "cgroup.h" +#include "debugfs.h" /* MAX_PATH, STR() */ #include "evlist.h" int nr_cgroups; @@ -11,7 +12,7 @@ static int cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; - char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; + char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1]; char *token, *saved_ptr = NULL; int found = 0; @@ -24,8 +25,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) * and inspect every cgroupfs mount point to find one that has * perf_event subsystem */ - while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" - STR(PATH_MAX)"s %*d %*d\n", + while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %" + STR(MAX_PATH)"s %*d %*d\n", mountpoint, type, tokens) == 3) { if (!strcmp(type, "cgroup")) { @@ -56,15 +57,15 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) static int open_cgroup(char *name) { - char path[PATH_MAX + 1]; - char mnt[PATH_MAX + 1]; + char path[MAX_PATH+1]; + char mnt[MAX_PATH+1]; int fd; - if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1)) + if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1)) return -1; - snprintf(path, 
PATH_MAX, "%s/%s", mnt, name); + snprintf(path, MAX_PATH, "%s/%s", mnt, name); fd = open(path, O_RDONLY); if (fd == -1) diff --git a/trunk/tools/perf/util/config.c b/trunk/tools/perf/util/config.c index 0deac6a14b65..80d9598db31a 100644 --- a/trunk/tools/perf/util/config.c +++ b/trunk/tools/perf/util/config.c @@ -1,8 +1,5 @@ /* - * config.c - * - * Helper functions for parsing config items. - * Originally copied from GIT source. + * GIT - The information manager from hell * * Copyright (C) Linus Torvalds, 2005 * Copyright (C) Johannes Schindelin, 2005 diff --git a/trunk/tools/perf/util/debugfs.c b/trunk/tools/perf/util/debugfs.c index ffc35e748e89..a88fefc0cc0a 100644 --- a/trunk/tools/perf/util/debugfs.c +++ b/trunk/tools/perf/util/debugfs.c @@ -2,12 +2,8 @@ #include "debugfs.h" #include "cache.h" -#include -#include - static int debugfs_premounted; -char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug"; -char tracing_events_path[PATH_MAX + 1] = "/sys/kernel/debug/tracing/events"; +static char debugfs_mountpoint[MAX_PATH+1]; static const char *debugfs_known_mountpoints[] = { "/sys/kernel/debug/", @@ -66,9 +62,11 @@ const char *debugfs_find_mountpoint(void) /* give up and parse /proc/mounts */ fp = fopen("/proc/mounts", "r"); if (fp == NULL) - return NULL; + die("Can't open /proc/mounts for read"); - while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n", + while (fscanf(fp, "%*s %" + STR(MAX_PATH) + "s %99s %*s %*d %*d\n", debugfs_mountpoint, type) == 2) { if (strcmp(type, "debugfs") == 0) break; @@ -108,12 +106,6 @@ int debugfs_valid_entry(const char *path) return 0; } -static void debugfs_set_tracing_events_path(const char *mountpoint) -{ - snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s", - mountpoint, "tracing/events"); -} - /* mount the debugfs somewhere if it's not mounted */ char *debugfs_mount(const char *mountpoint) @@ -121,7 +113,7 @@ char *debugfs_mount(const char *mountpoint) /* see if it's already mounted */ if (debugfs_find_mountpoint()) { debugfs_premounted = 1; - goto out; + return debugfs_mountpoint; } /* if not mounted and no argument */ @@ -137,17 +129,10 @@ char *debugfs_mount(const char *mountpoint) return NULL; /* save the mountpoint */ - debugfs_found = 1; strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint)); -out: - debugfs_set_tracing_events_path(debugfs_mountpoint); - return debugfs_mountpoint; -} + debugfs_found = 1; -void debugfs_set_path(const char *mountpoint) -{ - snprintf(debugfs_mountpoint, sizeof(debugfs_mountpoint), "%s", mountpoint); - debugfs_set_tracing_events_path(mountpoint); + return debugfs_mountpoint; } /* umount the debugfs */ @@ -173,7 +158,7 @@ int debugfs_umount(void) int debugfs_write(const char *entry, const char *value) { - char path[PATH_MAX + 1]; + char path[MAX_PATH+1]; int ret, count; int fd; @@ -218,7 +203,7 @@ int debugfs_write(const char *entry, const char *value) */ int debugfs_read(const char *entry, char *buffer, size_t size) { - char path[PATH_MAX + 1]; + char path[MAX_PATH+1]; int ret; int fd; diff --git a/trunk/tools/perf/util/debugfs.h b/trunk/tools/perf/util/debugfs.h index 4a878f735eb0..83a02879745f 100644 --- a/trunk/tools/perf/util/debugfs.h +++ b/trunk/tools/perf/util/debugfs.h @@ -1,18 +1,25 @@ #ifndef __DEBUGFS_H__ #define __DEBUGFS_H__ -const char *debugfs_find_mountpoint(void); -int debugfs_valid_mountpoint(const char *debugfs); -int debugfs_valid_entry(const char *path); -char *debugfs_mount(const char *mountpoint); -int debugfs_umount(void); -void 
debugfs_set_path(const char *mountpoint); -int debugfs_write(const char *entry, const char *value); -int debugfs_read(const char *entry, char *buffer, size_t size); -void debugfs_force_cleanup(void); -int debugfs_make_path(const char *element, char *buffer, int size); +#include -extern char debugfs_mountpoint[]; -extern char tracing_events_path[]; +#ifndef MAX_PATH +# define MAX_PATH 256 +#endif + +#ifndef STR +# define _STR(x) #x +# define STR(x) _STR(x) +#endif + +extern const char *debugfs_find_mountpoint(void); +extern int debugfs_valid_mountpoint(const char *debugfs); +extern int debugfs_valid_entry(const char *path); +extern char *debugfs_mount(const char *mountpoint); +extern int debugfs_umount(void); +extern int debugfs_write(const char *entry, const char *value); +extern int debugfs_read(const char *entry, char *buffer, size_t size); +extern void debugfs_force_cleanup(void); +extern int debugfs_make_path(const char *element, char *buffer, int size); #endif /* __DEBUGFS_H__ */ diff --git a/trunk/tools/perf/util/event.c b/trunk/tools/perf/util/event.c index 73ddaf06b8e7..437f8ca679a0 100644 --- a/trunk/tools/perf/util/event.c +++ b/trunk/tools/perf/util/event.c @@ -1,6 +1,7 @@ #include #include "event.h" #include "debug.h" +#include "session.h" #include "sort.h" #include "string.h" #include "strlist.h" @@ -43,27 +44,36 @@ static struct perf_sample synth_sample = { .period = 1, }; -static pid_t perf_event__get_comm_tgid(pid_t pid, char *comm, size_t len) +static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid, + int full, perf_event__handler_t process, + struct perf_session *session) { char filename[PATH_MAX]; char bf[BUFSIZ]; FILE *fp; size_t size = 0; - pid_t tgid = -1; + DIR *tasks; + struct dirent dirent, *next; + pid_t tgid = 0; snprintf(filename, sizeof(filename), "/proc/%d/status", pid); fp = fopen(filename, "r"); if (fp == NULL) { +out_race: + /* + * We raced with a task exiting - just return: + */ pr_debug("couldn't open %s\n", filename); return 0; } - while (!comm[0] || (tgid < 0)) { + memset(&event->comm, 0, sizeof(event->comm)); + + while (!event->comm.comm[0] || !event->comm.pid) { if (fgets(bf, sizeof(bf), fp) == NULL) { - pr_warning("couldn't get COMM and pgid, malformed %s\n", - filename); - break; + pr_warning("couldn't get COMM and pgid, malformed %s\n", filename); + goto out; } if (memcmp(bf, "Name:", 5) == 0) { @@ -71,65 +81,33 @@ static pid_t perf_event__get_comm_tgid(pid_t pid, char *comm, size_t len) while (*name && isspace(*name)) ++name; size = strlen(name) - 1; - if (size >= len) - size = len - 1; - memcpy(comm, name, size); - + memcpy(event->comm.comm, name, size++); } else if (memcmp(bf, "Tgid:", 5) == 0) { char *tgids = bf + 5; while (*tgids && isspace(*tgids)) ++tgids; - tgid = atoi(tgids); + tgid = event->comm.pid = atoi(tgids); } } - fclose(fp); - - return tgid; -} - -static pid_t perf_event__synthesize_comm(struct perf_tool *tool, - union perf_event *event, pid_t pid, - int full, - perf_event__handler_t process, - struct machine *machine) -{ - char filename[PATH_MAX]; - size_t size; - DIR *tasks; - struct dirent dirent, *next; - pid_t tgid; - - memset(&event->comm, 0, sizeof(event->comm)); - - tgid = perf_event__get_comm_tgid(pid, event->comm.comm, - sizeof(event->comm.comm)); - if (tgid < 0) - goto out; - - event->comm.pid = tgid; event->comm.header.type = PERF_RECORD_COMM; - - size = strlen(event->comm.comm) + 1; size = ALIGN(size, sizeof(u64)); - memset(event->comm.comm + size, 0, machine->id_hdr_size); + 
memset(event->comm.comm + size, 0, session->id_hdr_size); event->comm.header.size = (sizeof(event->comm) - (sizeof(event->comm.comm) - size) + - machine->id_hdr_size); + session->id_hdr_size); if (!full) { event->comm.tid = pid; - process(tool, event, &synth_sample, machine); + process(event, &synth_sample, session); goto out; } snprintf(filename, sizeof(filename), "/proc/%d/task", pid); tasks = opendir(filename); - if (tasks == NULL) { - pr_debug("couldn't open %s\n", filename); - return 0; - } + if (tasks == NULL) + goto out_race; while (!readdir_r(tasks, &dirent, &next) && next) { char *end; @@ -137,32 +115,22 @@ static pid_t perf_event__synthesize_comm(struct perf_tool *tool, if (*end) continue; - /* already have tgid; jut want to update the comm */ - (void) perf_event__get_comm_tgid(pid, event->comm.comm, - sizeof(event->comm.comm)); - - size = strlen(event->comm.comm) + 1; - size = ALIGN(size, sizeof(u64)); - memset(event->comm.comm + size, 0, machine->id_hdr_size); - event->comm.header.size = (sizeof(event->comm) - - (sizeof(event->comm.comm) - size) + - machine->id_hdr_size); - event->comm.tid = pid; - process(tool, event, &synth_sample, machine); + process(event, &synth_sample, session); } closedir(tasks); out: + fclose(fp); + return tgid; } -static int perf_event__synthesize_mmap_events(struct perf_tool *tool, - union perf_event *event, +static int perf_event__synthesize_mmap_events(union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, - struct machine *machine) + struct perf_session *session) { char filename[PATH_MAX]; FILE *fp; @@ -225,12 +193,12 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, event->mmap.len -= event->mmap.start; event->mmap.header.size = (sizeof(event->mmap) - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, machine->id_hdr_size); - event->mmap.header.size += machine->id_hdr_size; + memset(event->mmap.filename + size, 0, session->id_hdr_size); + event->mmap.header.size += session->id_hdr_size; event->mmap.pid = tgid; event->mmap.tid = pid; - process(tool, event, &synth_sample, machine); + process(event, &synth_sample, session); } } @@ -238,14 +206,14 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, return 0; } -int perf_event__synthesize_modules(struct perf_tool *tool, - perf_event__handler_t process, +int perf_event__synthesize_modules(perf_event__handler_t process, + struct perf_session *session, struct machine *machine) { struct rb_node *nd; struct map_groups *kmaps = &machine->kmaps; union perf_event *event = zalloc((sizeof(event->mmap) + - machine->id_hdr_size)); + session->id_hdr_size)); if (event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); @@ -275,15 +243,15 @@ int perf_event__synthesize_modules(struct perf_tool *tool, event->mmap.header.type = PERF_RECORD_MMAP; event->mmap.header.size = (sizeof(event->mmap) - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, machine->id_hdr_size); - event->mmap.header.size += machine->id_hdr_size; + memset(event->mmap.filename + size, 0, session->id_hdr_size); + event->mmap.header.size += session->id_hdr_size; event->mmap.start = pos->start; event->mmap.len = pos->end - pos->start; event->mmap.pid = machine->pid; memcpy(event->mmap.filename, pos->dso->long_name, pos->dso->long_name_len + 1); - process(tool, event, &synth_sample, machine); + process(event, &synth_sample, session); } free(event); @@ -292,69 +260,40 @@ int 
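One piece of the sizing logic is identical on both sides of this change: the comm string length is rounded up to a u64 boundary, then the per-sample id header (machine->id_hdr_size in one version, session->id_hdr_size in the other) is appended to the record. A tiny sketch of that arithmetic with ALIGN written out (the 24-byte id header is an assumed example value, not something this patch fixes):

#include <stdio.h>
#include <string.h>

/* Round x up to the next multiple of a; a must be a power of two. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
        const char comm[] = "sleep";    /* 5 chars + NUL = 6 bytes */
        size_t size = strlen(comm) + 1;
        size_t id_hdr_size = 24;        /* e.g. TID + TIME + ID */

        size = ALIGN(size, sizeof(unsigned long long));  /* 6 -> 8 */
        printf("padded comm: %zu bytes, with id header: %zu bytes\n",
               size, size + id_hdr_size);
        return 0;
}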
perf_event__synthesize_modules(struct perf_tool *tool, static int __event__synthesize_thread(union perf_event *comm_event, union perf_event *mmap_event, - pid_t pid, int full, - perf_event__handler_t process, - struct perf_tool *tool, - struct machine *machine) + pid_t pid, perf_event__handler_t process, + struct perf_session *session) { - pid_t tgid = perf_event__synthesize_comm(tool, comm_event, pid, full, - process, machine); + pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process, + session); if (tgid == -1) return -1; - return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid, - process, machine); + return perf_event__synthesize_mmap_events(mmap_event, pid, tgid, + process, session); } -int perf_event__synthesize_thread_map(struct perf_tool *tool, - struct thread_map *threads, +int perf_event__synthesize_thread_map(struct thread_map *threads, perf_event__handler_t process, - struct machine *machine) + struct perf_session *session) { union perf_event *comm_event, *mmap_event; - int err = -1, thread, j; + int err = -1, thread; - comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); + comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); if (comm_event == NULL) goto out; - mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size); + mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); if (mmap_event == NULL) goto out_free_comm; err = 0; for (thread = 0; thread < threads->nr; ++thread) { if (__event__synthesize_thread(comm_event, mmap_event, - threads->map[thread], 0, - process, tool, machine)) { + threads->map[thread], + process, session)) { err = -1; break; } - - /* - * comm.pid is set to thread group id by - * perf_event__synthesize_comm - */ - if ((int) comm_event->comm.pid != threads->map[thread]) { - bool need_leader = true; - - /* is thread group leader in thread_map? 
*/ - for (j = 0; j < threads->nr; ++j) { - if ((int) comm_event->comm.pid == threads->map[j]) { - need_leader = false; - break; - } - } - - /* if not, generate events for it */ - if (need_leader && - __event__synthesize_thread(comm_event, - mmap_event, - comm_event->comm.pid, 0, - process, tool, machine)) { - err = -1; - break; - } - } } free(mmap_event); out_free_comm: @@ -363,20 +302,19 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool, return err; } -int perf_event__synthesize_threads(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine) +int perf_event__synthesize_threads(perf_event__handler_t process, + struct perf_session *session) { DIR *proc; struct dirent dirent, *next; union perf_event *comm_event, *mmap_event; int err = -1; - comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); + comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); if (comm_event == NULL) goto out; - mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size); + mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); if (mmap_event == NULL) goto out_free_comm; @@ -391,8 +329,8 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (*end) /* only interested in proper numerical dirents */ continue; - __event__synthesize_thread(comm_event, mmap_event, pid, 1, - process, tool, machine); + __event__synthesize_thread(comm_event, mmap_event, pid, + process, session); } closedir(proc); @@ -427,8 +365,8 @@ static int find_symbol_cb(void *arg, const char *name, char type, return 1; } -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, - perf_event__handler_t process, +int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, + struct perf_session *session, struct machine *machine, const char *symbol_name) { @@ -445,7 +383,7 @@ int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, */ struct process_symbol_args args = { .name = symbol_name, }; union perf_event *event = zalloc((sizeof(event->mmap) + - machine->id_hdr_size)); + session->id_hdr_size)); if (event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); @@ -479,32 +417,25 @@ int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, size = ALIGN(size, sizeof(u64)); event->mmap.header.type = PERF_RECORD_MMAP; event->mmap.header.size = (sizeof(event->mmap) - - (sizeof(event->mmap.filename) - size) + machine->id_hdr_size); + (sizeof(event->mmap.filename) - size) + session->id_hdr_size); event->mmap.pgoff = args.start; event->mmap.start = map->start; event->mmap.len = map->end - event->mmap.start; event->mmap.pid = machine->pid; - err = process(tool, event, &synth_sample, machine); + err = process(event, &synth_sample, session); free(event); return err; } -size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp) -{ - return fprintf(fp, ": %s:%d\n", event->comm.comm, event->comm.tid); -} - -int perf_event__process_comm(struct perf_tool *tool __used, - union perf_event *event, +int perf_event__process_comm(union perf_event *event, struct perf_sample *sample __used, - struct machine *machine) + struct perf_session *session) { - struct thread *thread = machine__findnew_thread(machine, event->comm.tid); + struct thread *thread = perf_session__findnew(session, event->comm.tid); - if (dump_trace) - perf_event__fprintf_comm(event, stdout); + dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { 
dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); @@ -514,13 +445,13 @@ int perf_event__process_comm(struct perf_tool *tool __used, return 0; } -int perf_event__process_lost(struct perf_tool *tool __used, - union perf_event *event, +int perf_event__process_lost(union perf_event *event, struct perf_sample *sample __used, - struct machine *machine __used) + struct perf_session *session) { dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", event->lost.id, event->lost.lost); + session->hists.stats.total_lost += event->lost.lost; return 0; } @@ -537,15 +468,21 @@ static void perf_event__set_kernel_mmap_len(union perf_event *event, maps[MAP__FUNCTION]->end = ~0ULL; } -static int perf_event__process_kernel_mmap(struct perf_tool *tool __used, - union perf_event *event, - struct machine *machine) +static int perf_event__process_kernel_mmap(union perf_event *event, + struct perf_session *session) { struct map *map; char kmmap_prefix[PATH_MAX]; + struct machine *machine; enum dso_kernel_type kernel_type; bool is_kernel_mmap; + machine = perf_session__findnew_machine(session, event->mmap.pid); + if (!machine) { + pr_err("Can't find id %d's machine\n", event->mmap.pid); + goto out_problem; + } + machine__mmap_name(machine, kmmap_prefix, sizeof(kmmap_prefix)); if (machine__is_host(machine)) kernel_type = DSO_TYPE_KERNEL; @@ -612,9 +549,9 @@ static int perf_event__process_kernel_mmap(struct perf_tool *tool __used, * time /proc/sys/kernel/kptr_restrict was non zero. */ if (event->mmap.pgoff != 0) { - maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, - symbol_name, - event->mmap.pgoff); + perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, + symbol_name, + event->mmap.pgoff); } if (machine__is_default_guest(machine)) { @@ -630,35 +567,32 @@ static int perf_event__process_kernel_mmap(struct perf_tool *tool __used, return -1; } -size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp) -{ - return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", - event->mmap.pid, event->mmap.tid, event->mmap.start, - event->mmap.len, event->mmap.pgoff, event->mmap.filename); -} - -int perf_event__process_mmap(struct perf_tool *tool, - union perf_event *event, +int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample __used, - struct machine *machine) + struct perf_session *session) { + struct machine *machine; struct thread *thread; struct map *map; u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; int ret = 0; - if (dump_trace) - perf_event__fprintf_mmap(event, stdout); + dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", + event->mmap.pid, event->mmap.tid, event->mmap.start, + event->mmap.len, event->mmap.pgoff, event->mmap.filename); if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL || cpumode == PERF_RECORD_MISC_KERNEL) { - ret = perf_event__process_kernel_mmap(tool, event, machine); + ret = perf_event__process_kernel_mmap(event, session); if (ret < 0) goto out_problem; return 0; } - thread = machine__findnew_thread(machine, event->mmap.pid); + machine = perf_session__find_host_machine(session); + if (machine == NULL) + goto out_problem; + thread = perf_session__findnew(session, event->mmap.pid); if (thread == NULL) goto out_problem; map = map__new(&machine->user_dsos, event->mmap.start, @@ -676,26 +610,18 @@ int perf_event__process_mmap(struct perf_tool *tool, return 0; } -size_t perf_event__fprintf_task(union perf_event *event, FILE *fp) -{ - return fprintf(fp, "(%d:%d):(%d:%d)\n", - 
event->fork.pid, event->fork.tid, - event->fork.ppid, event->fork.ptid); -} - -int perf_event__process_task(struct perf_tool *tool __used, - union perf_event *event, +int perf_event__process_task(union perf_event *event, struct perf_sample *sample __used, - struct machine *machine) + struct perf_session *session) { - struct thread *thread = machine__findnew_thread(machine, event->fork.tid); - struct thread *parent = machine__findnew_thread(machine, event->fork.ptid); + struct thread *thread = perf_session__findnew(session, event->fork.tid); + struct thread *parent = perf_session__findnew(session, event->fork.ptid); - if (dump_trace) - perf_event__fprintf_task(event, stdout); + dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, + event->fork.ppid, event->fork.ptid); if (event->header.type == PERF_RECORD_EXIT) { - machine__remove_thread(machine, thread); + perf_session__remove_thread(session, thread); return 0; } @@ -708,45 +634,22 @@ int perf_event__process_task(struct perf_tool *tool __used, return 0; } -size_t perf_event__fprintf(union perf_event *event, FILE *fp) -{ - size_t ret = fprintf(fp, "PERF_RECORD_%s", - perf_event__name(event->header.type)); - - switch (event->header.type) { - case PERF_RECORD_COMM: - ret += perf_event__fprintf_comm(event, fp); - break; - case PERF_RECORD_FORK: - case PERF_RECORD_EXIT: - ret += perf_event__fprintf_task(event, fp); - break; - case PERF_RECORD_MMAP: - ret += perf_event__fprintf_mmap(event, fp); - break; - default: - ret += fprintf(fp, "\n"); - } - - return ret; -} - -int perf_event__process(struct perf_tool *tool, union perf_event *event, - struct perf_sample *sample, struct machine *machine) +int perf_event__process(union perf_event *event, struct perf_sample *sample, + struct perf_session *session) { switch (event->header.type) { case PERF_RECORD_COMM: - perf_event__process_comm(tool, event, sample, machine); + perf_event__process_comm(event, sample, session); break; case PERF_RECORD_MMAP: - perf_event__process_mmap(tool, event, sample, machine); + perf_event__process_mmap(event, sample, session); break; case PERF_RECORD_FORK: case PERF_RECORD_EXIT: - perf_event__process_task(tool, event, sample, machine); + perf_event__process_task(event, sample, session); break; case PERF_RECORD_LOST: - perf_event__process_lost(tool, event, sample, machine); + perf_event__process_lost(event, sample, session); default: break; } @@ -755,29 +658,36 @@ int perf_event__process(struct perf_tool *tool, union perf_event *event, } void thread__find_addr_map(struct thread *self, - struct machine *machine, u8 cpumode, - enum map_type type, u64 addr, + struct perf_session *session, u8 cpumode, + enum map_type type, pid_t pid, u64 addr, struct addr_location *al) { struct map_groups *mg = &self->mg; + struct machine *machine = NULL; al->thread = self; al->addr = addr; al->cpumode = cpumode; al->filtered = false; - if (machine == NULL) { - al->map = NULL; - return; - } - if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) { al->level = 'k'; + machine = perf_session__find_host_machine(session); + if (machine == NULL) { + al->map = NULL; + return; + } mg = &machine->kmaps; } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) { al->level = '.'; + machine = perf_session__find_host_machine(session); } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) { al->level = 'g'; + machine = perf_session__find_machine(session, pid); + if (machine == NULL) { + al->map = NULL; + return; + } mg = &machine->kmaps; } else { /* @@ -823,12 +733,13 @@ void 
thread__find_addr_map(struct thread *self, al->addr = al->map->map_ip(al->map, al->addr); } -void thread__find_addr_location(struct thread *thread, struct machine *machine, - u8 cpumode, enum map_type type, u64 addr, +void thread__find_addr_location(struct thread *self, + struct perf_session *session, u8 cpumode, + enum map_type type, pid_t pid, u64 addr, struct addr_location *al, symbol_filter_t filter) { - thread__find_addr_map(thread, machine, cpumode, type, addr, al); + thread__find_addr_map(self, session, cpumode, type, pid, addr, al); if (al->map != NULL) al->sym = map__find_symbol(al->map, al->addr, filter); else @@ -836,13 +747,13 @@ void thread__find_addr_location(struct thread *thread, struct machine *machine, } int perf_event__preprocess_sample(const union perf_event *event, - struct machine *machine, + struct perf_session *session, struct addr_location *al, struct perf_sample *sample, symbol_filter_t filter) { u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - struct thread *thread = machine__findnew_thread(machine, event->ip.pid); + struct thread *thread = perf_session__findnew(session, event->ip.pid); if (thread == NULL) return -1; @@ -853,18 +764,18 @@ int perf_event__preprocess_sample(const union perf_event *event, dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); /* - * Have we already created the kernel maps for this machine? + * Have we already created the kernel maps for the host machine? * * This should have happened earlier, when we processed the kernel MMAP * events, but for older perf.data files there was no such thing, so do * it now. */ if (cpumode == PERF_RECORD_MISC_KERNEL && - machine->vmlinux_maps[MAP__FUNCTION] == NULL) - machine__create_kernel_maps(machine); + session->host_machine.vmlinux_maps[MAP__FUNCTION] == NULL) + machine__create_kernel_maps(&session->host_machine); - thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION, - event->ip.ip, al); + thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, + event->ip.pid, event->ip.ip, al); dump_printf(" ...... dso: %s\n", al->map ? al->map->dso->long_name : al->level == 'H' ? 
"[hypervisor]" : ""); @@ -872,14 +783,13 @@ int perf_event__preprocess_sample(const union perf_event *event, al->cpu = sample->cpu; if (al->map) { - struct dso *dso = al->map->dso; - if (symbol_conf.dso_list && - (!dso || !(strlist__has_entry(symbol_conf.dso_list, - dso->short_name) || - (dso->short_name != dso->long_name && - strlist__has_entry(symbol_conf.dso_list, - dso->long_name))))) + (!al->map || !al->map->dso || + !(strlist__has_entry(symbol_conf.dso_list, + al->map->dso->short_name) || + (al->map->dso->short_name != al->map->dso->long_name && + strlist__has_entry(symbol_conf.dso_list, + al->map->dso->long_name))))) goto out_filtered; al->sym = map__find_symbol(al->map, al->addr, filter); diff --git a/trunk/tools/perf/util/event.h b/trunk/tools/perf/util/event.h index cbdeaad9c5e5..357a85b85248 100644 --- a/trunk/tools/perf/util/event.h +++ b/trunk/tools/perf/util/event.h @@ -2,7 +2,6 @@ #define __PERF_RECORD_H #include -#include #include "../perf.h" #include "map.h" @@ -142,54 +141,43 @@ union perf_event { void perf_event__print_totals(void); -struct perf_tool; +struct perf_session; struct thread_map; -typedef int (*perf_event__handler_t)(struct perf_tool *tool, - union perf_event *event, +typedef int (*perf_event__handler_synth_t)(union perf_event *event, + struct perf_session *session); +typedef int (*perf_event__handler_t)(union perf_event *event, struct perf_sample *sample, - struct machine *machine); + struct perf_session *session); -int perf_event__synthesize_thread_map(struct perf_tool *tool, - struct thread_map *threads, +int perf_event__synthesize_thread_map(struct thread_map *threads, perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_threads(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, - perf_event__handler_t process, + struct perf_session *session); +int perf_event__synthesize_threads(perf_event__handler_t process, + struct perf_session *session); +int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, + struct perf_session *session, struct machine *machine, const char *symbol_name); -int perf_event__synthesize_modules(struct perf_tool *tool, - perf_event__handler_t process, +int perf_event__synthesize_modules(perf_event__handler_t process, + struct perf_session *session, struct machine *machine); -int perf_event__process_comm(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine); -int perf_event__process_lost(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine); -int perf_event__process_mmap(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine); -int perf_event__process_task(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine); -int perf_event__process(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine); +int perf_event__process_comm(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); +int perf_event__process_lost(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); +int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); +int perf_event__process_task(union perf_event *event, struct perf_sample *sample, + struct 
perf_session *session); +int perf_event__process(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); struct addr_location; int perf_event__preprocess_sample(const union perf_event *self, - struct machine *machine, + struct perf_session *session, struct addr_location *al, struct perf_sample *sample, symbol_filter_t filter); @@ -199,13 +187,5 @@ const char *perf_event__name(unsigned int id); int perf_event__parse_sample(const union perf_event *event, u64 type, int sample_size, bool sample_id_all, struct perf_sample *sample, bool swapped); -int perf_event__synthesize_sample(union perf_event *event, u64 type, - const struct perf_sample *sample, - bool swapped); - -size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp); -size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp); -size_t perf_event__fprintf_task(union perf_event *event, FILE *fp); -size_t perf_event__fprintf(union perf_event *event, FILE *fp); #endif /* __PERF_RECORD_H */ diff --git a/trunk/tools/perf/util/evlist.c b/trunk/tools/perf/util/evlist.c index fa1837088ca8..fbb4b4ab9cc6 100644 --- a/trunk/tools/perf/util/evlist.c +++ b/trunk/tools/perf/util/evlist.c @@ -6,16 +6,12 @@ * * Released under the GPL v2. (and only v2, not any later version) */ -#include "util.h" -#include "debugfs.h" #include #include "cpumap.h" #include "thread_map.h" #include "evlist.h" #include "evsel.h" -#include - -#include "parse-events.h" +#include "util.h" #include @@ -34,7 +30,6 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, INIT_HLIST_HEAD(&evlist->heads[i]); INIT_LIST_HEAD(&evlist->entries); perf_evlist__set_maps(evlist, cpus, threads); - evlist->workload.pid = -1; } struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, @@ -48,22 +43,6 @@ struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, return evlist; } -void perf_evlist__config_attrs(struct perf_evlist *evlist, - struct perf_record_opts *opts) -{ - struct perf_evsel *evsel; - - if (evlist->cpus->map[0] < 0) - opts->no_inherit = true; - - list_for_each_entry(evsel, &evlist->entries, node) { - perf_evsel__config(evsel, opts); - - if (evlist->nr_entries > 1) - evsel->attr.sample_type |= PERF_SAMPLE_ID; - } -} - static void perf_evlist__purge(struct perf_evlist *evlist) { struct perf_evsel *pos, *n; @@ -97,14 +76,6 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry) ++evlist->nr_entries; } -static void perf_evlist__splice_list_tail(struct perf_evlist *evlist, - struct list_head *list, - int nr_entries) -{ - list_splice_tail(list, &evlist->entries); - evlist->nr_entries += nr_entries; -} - int perf_evlist__add_default(struct perf_evlist *evlist) { struct perf_event_attr attr = { @@ -129,126 +100,6 @@ int perf_evlist__add_default(struct perf_evlist *evlist) return -ENOMEM; } -int perf_evlist__add_attrs(struct perf_evlist *evlist, - struct perf_event_attr *attrs, size_t nr_attrs) -{ - struct perf_evsel *evsel, *n; - LIST_HEAD(head); - size_t i; - - for (i = 0; i < nr_attrs; i++) { - evsel = perf_evsel__new(attrs + i, evlist->nr_entries + i); - if (evsel == NULL) - goto out_delete_partial_list; - list_add_tail(&evsel->node, &head); - } - - perf_evlist__splice_list_tail(evlist, &head, nr_attrs); - - return 0; - -out_delete_partial_list: - list_for_each_entry_safe(evsel, n, &head, node) - perf_evsel__delete(evsel); - return -1; -} - -static int trace_event__id(const char *evname) -{ - char *filename, *colon; - int err = -1, fd; - - if (asprintf(&filename, "%s/%s/id", 
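Most of the churn in event.c and event.h is mechanical: perf_event__handler_t flips between the (tool, event, sample, machine) and (event, sample, session) signatures, while perf_event__process() keeps dispatching on the record type. A reduced sketch of that dispatch shape (every type below is a stand-in invented for the example, not the real perf definition):

#include <stdio.h>

struct perf_sample  { int cpu; };
struct perf_session { const char *name; };
union  perf_event   { struct { unsigned int type; } header; };

enum { RECORD_COMM, RECORD_MMAP, RECORD_FORK, RECORD_EXIT, RECORD_LOST };

typedef int (*handler_t)(union perf_event *event,
                         struct perf_sample *sample,
                         struct perf_session *session);

static int process_comm(union perf_event *event, struct perf_sample *sample,
                        struct perf_session *session)
{
        (void)event; (void)sample;
        printf("COMM event on session %s\n", session->name);
        return 0;
}

/* Dispatch on header.type, the way perf_event__process() does. */
static int process(union perf_event *event, struct perf_sample *sample,
                   struct perf_session *session)
{
        switch (event->header.type) {
        case RECORD_COMM:
                return process_comm(event, sample, session);
        case RECORD_FORK:
        case RECORD_EXIT:
        case RECORD_MMAP:
        case RECORD_LOST:
        default:
                return 0;       /* other record types elided */
        }
}

int main(void)
{
        union perf_event ev = { .header = { .type = RECORD_COMM } };
        struct perf_sample sample = { 0 };
        struct perf_session session = { "example" };
        handler_t dispatch = process;

        return dispatch(&ev, &sample, &session);
}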
tracing_events_path, evname) < 0) - return -1; - - colon = strrchr(filename, ':'); - if (colon != NULL) - *colon = '/'; - - fd = open(filename, O_RDONLY); - if (fd >= 0) { - char id[16]; - if (read(fd, id, sizeof(id)) > 0) - err = atoi(id); - close(fd); - } - - free(filename); - return err; -} - -int perf_evlist__add_tracepoints(struct perf_evlist *evlist, - const char *tracepoints[], - size_t nr_tracepoints) -{ - int err; - size_t i; - struct perf_event_attr *attrs = zalloc(nr_tracepoints * sizeof(*attrs)); - - if (attrs == NULL) - return -1; - - for (i = 0; i < nr_tracepoints; i++) { - err = trace_event__id(tracepoints[i]); - - if (err < 0) - goto out_free_attrs; - - attrs[i].type = PERF_TYPE_TRACEPOINT; - attrs[i].config = err; - attrs[i].sample_type = (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | - PERF_SAMPLE_CPU); - attrs[i].sample_period = 1; - } - - err = perf_evlist__add_attrs(evlist, attrs, nr_tracepoints); -out_free_attrs: - free(attrs); - return err; -} - -static struct perf_evsel * - perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id) -{ - struct perf_evsel *evsel; - - list_for_each_entry(evsel, &evlist->entries, node) { - if (evsel->attr.type == PERF_TYPE_TRACEPOINT && - (int)evsel->attr.config == id) - return evsel; - } - - return NULL; -} - -int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, - const struct perf_evsel_str_handler *assocs, - size_t nr_assocs) -{ - struct perf_evsel *evsel; - int err; - size_t i; - - for (i = 0; i < nr_assocs; i++) { - err = trace_event__id(assocs[i].name); - if (err < 0) - goto out; - - evsel = perf_evlist__find_tracepoint_by_id(evlist, err); - if (evsel == NULL) - continue; - - err = -EEXIST; - if (evsel->handler.func != NULL) - goto out; - evsel->handler.func = assocs[i].handler; - } - - err = 0; -out: - return err; -} - void perf_evlist__disable(struct perf_evlist *evlist) { int cpu, thread; @@ -275,7 +126,7 @@ void perf_evlist__enable(struct perf_evlist *evlist) } } -static int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) { int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries; evlist->pollfd = malloc(sizeof(struct pollfd) * nfds); @@ -431,7 +282,7 @@ void perf_evlist__munmap(struct perf_evlist *evlist) evlist->mmap = NULL; } -static int perf_evlist__alloc_mmap(struct perf_evlist *evlist) +int perf_evlist__alloc_mmap(struct perf_evlist *evlist) { evlist->nr_mmaps = evlist->cpus->nr; if (evlist->cpus->map[0] == -1) @@ -447,10 +298,8 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, evlist->mmap[idx].mask = mask; evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot, MAP_SHARED, fd, 0); - if (evlist->mmap[idx].base == MAP_FAILED) { - evlist->mmap[idx].base = NULL; + if (evlist->mmap[idx].base == MAP_FAILED) return -1; - } perf_evlist__add_pollfd(evlist, fd); return 0; @@ -551,22 +400,14 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, in * * Using perf_evlist__read_on_cpu does this automatically. */ -int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, - bool overwrite) +int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) { unsigned int page_size = sysconf(_SC_PAGE_SIZE); + int mask = pages * page_size - 1; struct perf_evsel *evsel; const struct cpu_map *cpus = evlist->cpus; const struct thread_map *threads = evlist->threads; - int prot = PROT_READ | (overwrite ? 
0 : PROT_WRITE), mask; - - /* 512 kiB: default amount of unprivileged mlocked memory */ - if (pages == UINT_MAX) - pages = (512 * 1024) / page_size; - else if (!is_power_of_2(pages)) - return -EINVAL; - - mask = pages * page_size - 1; + int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE); if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0) return -ENOMEM; @@ -671,38 +512,6 @@ u64 perf_evlist__sample_type(const struct perf_evlist *evlist) return first->attr.sample_type; } -u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist) -{ - struct perf_evsel *first; - struct perf_sample *data; - u64 sample_type; - u16 size = 0; - - first = list_entry(evlist->entries.next, struct perf_evsel, node); - - if (!first->attr.sample_id_all) - goto out; - - sample_type = first->attr.sample_type; - - if (sample_type & PERF_SAMPLE_TID) - size += sizeof(data->tid) * 2; - - if (sample_type & PERF_SAMPLE_TIME) - size += sizeof(data->time); - - if (sample_type & PERF_SAMPLE_ID) - size += sizeof(data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - size += sizeof(data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - size += sizeof(data->cpu) * 2; -out: - return size; -} - bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist) { struct perf_evsel *pos, *first; @@ -760,97 +569,3 @@ int perf_evlist__open(struct perf_evlist *evlist, bool group) return err; } - -int perf_evlist__prepare_workload(struct perf_evlist *evlist, - struct perf_record_opts *opts, - const char *argv[]) -{ - int child_ready_pipe[2], go_pipe[2]; - char bf; - - if (pipe(child_ready_pipe) < 0) { - perror("failed to create 'ready' pipe"); - return -1; - } - - if (pipe(go_pipe) < 0) { - perror("failed to create 'go' pipe"); - goto out_close_ready_pipe; - } - - evlist->workload.pid = fork(); - if (evlist->workload.pid < 0) { - perror("failed to fork"); - goto out_close_pipes; - } - - if (!evlist->workload.pid) { - if (opts->pipe_output) - dup2(2, 1); - - close(child_ready_pipe[0]); - close(go_pipe[1]); - fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); - - /* - * Do a dummy execvp to get the PLT entry resolved, - * so we avoid the resolver overhead on the real - * execvp call. - */ - execvp("", (char **)argv); - - /* - * Tell the parent we're ready to go - */ - close(child_ready_pipe[1]); - - /* - * Wait until the parent tells us to go. - */ - if (read(go_pipe[0], &bf, 1) == -1) - perror("unable to read pipe"); - - execvp(argv[0], (char **)argv); - - perror(argv[0]); - kill(getppid(), SIGUSR1); - exit(-1); - } - - if (!opts->system_wide && opts->target_tid == -1 && opts->target_pid == -1) - evlist->threads->map[0] = evlist->workload.pid; - - close(child_ready_pipe[1]); - close(go_pipe[0]); - /* - * wait for child to settle - */ - if (read(child_ready_pipe[0], &bf, 1) == -1) { - perror("unable to read pipe"); - goto out_close_pipes; - } - - evlist->workload.cork_fd = go_pipe[1]; - close(child_ready_pipe[0]); - return 0; - -out_close_pipes: - close(go_pipe[0]); - close(go_pipe[1]); -out_close_ready_pipe: - close(child_ready_pipe[0]); - close(child_ready_pipe[1]); - return -1; -} - -int perf_evlist__start_workload(struct perf_evlist *evlist) -{ - if (evlist->workload.cork_fd > 0) { - /* - * Remove the cork, let it rip! 
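Context for the mask computed above: when the ring-buffer length is a power of two, pages * page_size - 1 acts as a bitmask, so read offsets wrap around without a modulo; that is why the code being removed validated is_power_of_2(pages) (and defaulted to 512 KiB) before this revert drops the check. A small demonstration with example values (not taken from the patch):

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;   /* example; perf uses sysconf(_SC_PAGE_SIZE) */
        const unsigned long pages = 128;        /* must be a power of two for the mask trick */
        const unsigned long mask = pages * page_size - 1;
        const unsigned long head = 524296;      /* a position 8 bytes past the end */

        /* head & mask == head % (pages * page_size), but without a division */
        printf("wrapped offset: %lu\n", head & mask);
        return 0;
}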
- */ - return close(evlist->workload.cork_fd); - } - - return 0; -} diff --git a/trunk/tools/perf/util/evlist.h b/trunk/tools/perf/util/evlist.h index 8922aeed0467..1779ffef7828 100644 --- a/trunk/tools/perf/util/evlist.h +++ b/trunk/tools/perf/util/evlist.h @@ -2,16 +2,12 @@ #define __PERF_EVLIST_H 1 #include -#include #include "../perf.h" #include "event.h" -#include "util.h" -#include struct pollfd; struct thread_map; struct cpu_map; -struct perf_record_opts; #define PERF_EVLIST__HLIST_BITS 8 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) @@ -23,10 +19,6 @@ struct perf_evlist { int nr_fds; int nr_mmaps; int mmap_len; - struct { - int cork_fd; - pid_t pid; - } workload; bool overwrite; union perf_event event_copy; struct perf_mmap *mmap; @@ -36,11 +28,6 @@ struct perf_evlist { struct perf_evsel *selected; }; -struct perf_evsel_str_handler { - const char *name; - void *handler; -}; - struct perf_evsel; struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, @@ -52,26 +39,11 @@ void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); -int perf_evlist__add_attrs(struct perf_evlist *evlist, - struct perf_event_attr *attrs, size_t nr_attrs); -int perf_evlist__add_tracepoints(struct perf_evlist *evlist, - const char *tracepoints[], size_t nr_tracepoints); -int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, - const struct perf_evsel_str_handler *assocs, - size_t nr_assocs); - -#define perf_evlist__add_attrs_array(evlist, array) \ - perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array)) - -#define perf_evlist__add_tracepoints_array(evlist, array) \ - perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array)) - -#define perf_evlist__set_tracepoints_handlers_array(evlist, array) \ - perf_evlist__set_tracepoints_handlers(evlist, array, ARRAY_SIZE(array)) void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, u64 id); +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); @@ -80,16 +52,8 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx); int perf_evlist__open(struct perf_evlist *evlist, bool group); -void perf_evlist__config_attrs(struct perf_evlist *evlist, - struct perf_record_opts *opts); - -int perf_evlist__prepare_workload(struct perf_evlist *evlist, - struct perf_record_opts *opts, - const char *argv[]); -int perf_evlist__start_workload(struct perf_evlist *evlist); - -int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, - bool overwrite); +int perf_evlist__alloc_mmap(struct perf_evlist *evlist); +int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite); void perf_evlist__munmap(struct perf_evlist *evlist); void perf_evlist__disable(struct perf_evlist *evlist); @@ -113,7 +77,6 @@ int perf_evlist__set_filters(struct perf_evlist *evlist); u64 perf_evlist__sample_type(const struct perf_evlist *evlist); bool perf_evlist__sample_id_all(const const struct perf_evlist *evlist); -u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist); bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist); bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist); diff --git a/trunk/tools/perf/util/evsel.c b/trunk/tools/perf/util/evsel.c index 
667f3b78bb2c..d7915d4e77cb 100644 --- a/trunk/tools/perf/util/evsel.c +++ b/trunk/tools/perf/util/evsel.c @@ -63,79 +63,6 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx) return evsel; } -void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts) -{ - struct perf_event_attr *attr = &evsel->attr; - int track = !evsel->idx; /* only the first counter needs these */ - - attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0; - attr->inherit = !opts->no_inherit; - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING | - PERF_FORMAT_ID; - - attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; - - /* - * We default some events to a 1 default interval. But keep - * it a weak assumption overridable by the user. - */ - if (!attr->sample_period || (opts->user_freq != UINT_MAX && - opts->user_interval != ULLONG_MAX)) { - if (opts->freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; - attr->freq = 1; - attr->sample_freq = opts->freq; - } else { - attr->sample_period = opts->default_interval; - } - } - - if (opts->no_samples) - attr->sample_freq = 0; - - if (opts->inherit_stat) - attr->inherit_stat = 1; - - if (opts->sample_address) { - attr->sample_type |= PERF_SAMPLE_ADDR; - attr->mmap_data = track; - } - - if (opts->call_graph) - attr->sample_type |= PERF_SAMPLE_CALLCHAIN; - - if (opts->system_wide) - attr->sample_type |= PERF_SAMPLE_CPU; - - if (opts->period) - attr->sample_type |= PERF_SAMPLE_PERIOD; - - if (opts->sample_id_all_avail && - (opts->sample_time || opts->system_wide || - !opts->no_inherit || opts->cpu_list)) - attr->sample_type |= PERF_SAMPLE_TIME; - - if (opts->raw_samples) { - attr->sample_type |= PERF_SAMPLE_TIME; - attr->sample_type |= PERF_SAMPLE_RAW; - attr->sample_type |= PERF_SAMPLE_CPU; - } - - if (opts->no_delay) { - attr->watermark = 0; - attr->wakeup_events = 1; - } - - attr->mmap = track; - attr->comm = track; - - if (opts->target_pid == -1 && opts->target_tid == -1 && !opts->system_wide) { - attr->disabled = 1; - attr->enable_on_exec = 1; - } -} - int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) { int cpu, thread; @@ -460,7 +387,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, u32 val32[2]; } u; - memset(data, 0, sizeof(*data)); + data->cpu = data->pid = data->tid = -1; data->stream_id = data->id = data->time = -1ULL; @@ -577,82 +504,3 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, return 0; } - -int perf_event__synthesize_sample(union perf_event *event, u64 type, - const struct perf_sample *sample, - bool swapped) -{ - u64 *array; - - /* - * used for cross-endian analysis. See git commit 65014ab3 - * for why this goofiness is needed. 
- */ - union { - u64 val64; - u32 val32[2]; - } u; - - array = event->sample.array; - - if (type & PERF_SAMPLE_IP) { - event->ip.ip = sample->ip; - array++; - } - - if (type & PERF_SAMPLE_TID) { - u.val32[0] = sample->pid; - u.val32[1] = sample->tid; - if (swapped) { - /* - * Inverse of what is done in perf_event__parse_sample - */ - u.val32[0] = bswap_32(u.val32[0]); - u.val32[1] = bswap_32(u.val32[1]); - u.val64 = bswap_64(u.val64); - } - - *array = u.val64; - array++; - } - - if (type & PERF_SAMPLE_TIME) { - *array = sample->time; - array++; - } - - if (type & PERF_SAMPLE_ADDR) { - *array = sample->addr; - array++; - } - - if (type & PERF_SAMPLE_ID) { - *array = sample->id; - array++; - } - - if (type & PERF_SAMPLE_STREAM_ID) { - *array = sample->stream_id; - array++; - } - - if (type & PERF_SAMPLE_CPU) { - u.val32[0] = sample->cpu; - if (swapped) { - /* - * Inverse of what is done in perf_event__parse_sample - */ - u.val32[0] = bswap_32(u.val32[0]); - u.val64 = bswap_64(u.val64); - } - *array = u.val64; - array++; - } - - if (type & PERF_SAMPLE_PERIOD) { - *array = sample->period; - array++; - } - - return 0; -} diff --git a/trunk/tools/perf/util/evsel.h b/trunk/tools/perf/util/evsel.h index 326b8e4d5035..b1d15e6f7ae3 100644 --- a/trunk/tools/perf/util/evsel.h +++ b/trunk/tools/perf/util/evsel.h @@ -61,17 +61,12 @@ struct perf_evsel { off_t id_offset; }; struct cgroup_sel *cgrp; - struct { - void *func; - void *data; - } handler; bool supported; }; struct cpu_map; struct thread_map; struct perf_evlist; -struct perf_record_opts; struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); void perf_evsel__init(struct perf_evsel *evsel, @@ -79,9 +74,6 @@ void perf_evsel__init(struct perf_evsel *evsel, void perf_evsel__exit(struct perf_evsel *evsel); void perf_evsel__delete(struct perf_evsel *evsel); -void perf_evsel__config(struct perf_evsel *evsel, - struct perf_record_opts *opts); - int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); diff --git a/trunk/tools/perf/util/header.c b/trunk/tools/perf/util/header.c index 3e7e0b09c12c..33c17a2b2a81 100644 --- a/trunk/tools/perf/util/header.c +++ b/trunk/tools/perf/util/header.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include "evlist.h" @@ -29,6 +28,9 @@ static struct perf_trace_event_type *events; static u32 header_argc; static const char **header_argv; +static int dsos__write_buildid_table(struct perf_header *header, int fd); +static int perf_session__cache_build_ids(struct perf_session *session); + int perf_header__push_event(u64 id, const char *name) { if (strlen(name) > MAX_EVENT_NAME) @@ -185,252 +187,6 @@ perf_header__set_cmdline(int argc, const char **argv) return 0; } -#define dsos__for_each_with_build_id(pos, head) \ - list_for_each_entry(pos, head, node) \ - if (!pos->has_build_id) \ - continue; \ - else - -static int __dsos__write_buildid_table(struct list_head *head, pid_t pid, - u16 misc, int fd) -{ - struct dso *pos; - - dsos__for_each_with_build_id(pos, head) { - int err; - struct build_id_event b; - size_t len; - - if (!pos->hit) - continue; - len = pos->long_name_len + 1; - len = ALIGN(len, NAME_ALIGN); - memset(&b, 0, sizeof(b)); - memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); - b.pid = pid; - b.header.misc = misc; - b.header.size = sizeof(b) + len; - err = do_write(fd, &b, sizeof(b)); - if (err < 0) - return 
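The union-and-bswap dance deleted above (perf_event__synthesize_sample) is the write-side inverse of perf_event__parse_sample: two u32 fields share one u64 slot, and for cross-endian perf.data files each half is swapped before the combined word is swapped back. A stand-alone sketch of that packing (the pid/tid values are arbitrary):

#include <byteswap.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        union {
                uint64_t val64;
                uint32_t val32[2];
        } u;
        int swapped = 1;        /* pretend the data file has the other endianness */

        u.val32[0] = 1234;      /* e.g. pid */
        u.val32[1] = 5678;      /* e.g. tid */

        if (swapped) {
                /* inverse of what the parse side will do when reading */
                u.val32[0] = bswap_32(u.val32[0]);
                u.val32[1] = bswap_32(u.val32[1]);
                u.val64 = bswap_64(u.val64);
        }

        printf("packed sample word: %#llx\n", (unsigned long long)u.val64);
        return 0;
}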
err; - err = write_padded(fd, pos->long_name, - pos->long_name_len + 1, len); - if (err < 0) - return err; - } - - return 0; -} - -static int machine__write_buildid_table(struct machine *machine, int fd) -{ - int err; - u16 kmisc = PERF_RECORD_MISC_KERNEL, - umisc = PERF_RECORD_MISC_USER; - - if (!machine__is_host(machine)) { - kmisc = PERF_RECORD_MISC_GUEST_KERNEL; - umisc = PERF_RECORD_MISC_GUEST_USER; - } - - err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid, - kmisc, fd); - if (err == 0) - err = __dsos__write_buildid_table(&machine->user_dsos, - machine->pid, umisc, fd); - return err; -} - -static int dsos__write_buildid_table(struct perf_header *header, int fd) -{ - struct perf_session *session = container_of(header, - struct perf_session, header); - struct rb_node *nd; - int err = machine__write_buildid_table(&session->host_machine, fd); - - if (err) - return err; - - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - err = machine__write_buildid_table(pos, fd); - if (err) - break; - } - return err; -} - -int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, - const char *name, bool is_kallsyms) -{ - const size_t size = PATH_MAX; - char *realname, *filename = zalloc(size), - *linkname = zalloc(size), *targetname; - int len, err = -1; - - if (is_kallsyms) { - if (symbol_conf.kptr_restrict) { - pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n"); - return 0; - } - realname = (char *)name; - } else - realname = realpath(name, NULL); - - if (realname == NULL || filename == NULL || linkname == NULL) - goto out_free; - - len = snprintf(filename, size, "%s%s%s", - debugdir, is_kallsyms ? "/" : "", realname); - if (mkdir_p(filename, 0755)) - goto out_free; - - snprintf(filename + len, sizeof(filename) - len, "/%s", sbuild_id); - - if (access(filename, F_OK)) { - if (is_kallsyms) { - if (copyfile("/proc/kallsyms", filename)) - goto out_free; - } else if (link(realname, filename) && copyfile(name, filename)) - goto out_free; - } - - len = snprintf(linkname, size, "%s/.build-id/%.2s", - debugdir, sbuild_id); - - if (access(linkname, X_OK) && mkdir_p(linkname, 0755)) - goto out_free; - - snprintf(linkname + len, size - len, "/%s", sbuild_id + 2); - targetname = filename + strlen(debugdir) - 5; - memcpy(targetname, "../..", 5); - - if (symlink(targetname, linkname) == 0) - err = 0; -out_free: - if (!is_kallsyms) - free(realname); - free(filename); - free(linkname); - return err; -} - -static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size, - const char *name, const char *debugdir, - bool is_kallsyms) -{ - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; - - build_id__sprintf(build_id, build_id_size, sbuild_id); - - return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms); -} - -int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir) -{ - const size_t size = PATH_MAX; - char *filename = zalloc(size), - *linkname = zalloc(size); - int err = -1; - - if (filename == NULL || linkname == NULL) - goto out_free; - - snprintf(linkname, size, "%s/.build-id/%.2s/%s", - debugdir, sbuild_id, sbuild_id + 2); - - if (access(linkname, F_OK)) - goto out_free; - - if (readlink(linkname, filename, size - 1) < 0) - goto out_free; - - if (unlink(linkname)) - goto out_free; - - /* - * Since the link is relative, we must make it absolute: - */ - snprintf(linkname, size, "%s/.build-id/%.2s/%s", - debugdir, sbuild_id, filename); - - if (unlink(linkname)) - goto 
out_free; - - err = 0; -out_free: - free(filename); - free(linkname); - return err; -} - -static int dso__cache_build_id(struct dso *dso, const char *debugdir) -{ - bool is_kallsyms = dso->kernel && dso->long_name[0] != '/'; - - return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), - dso->long_name, debugdir, is_kallsyms); -} - -static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir) -{ - struct dso *pos; - int err = 0; - - dsos__for_each_with_build_id(pos, head) - if (dso__cache_build_id(pos, debugdir)) - err = -1; - - return err; -} - -static int machine__cache_build_ids(struct machine *machine, const char *debugdir) -{ - int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir); - ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir); - return ret; -} - -static int perf_session__cache_build_ids(struct perf_session *session) -{ - struct rb_node *nd; - int ret; - char debugdir[PATH_MAX]; - - snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir); - - if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) - return -1; - - ret = machine__cache_build_ids(&session->host_machine, debugdir); - - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret |= machine__cache_build_ids(pos, debugdir); - } - return ret ? -1 : 0; -} - -static bool machine__read_build_ids(struct machine *machine, bool with_hits) -{ - bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits); - ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits); - return ret; -} - -static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) -{ - struct rb_node *nd; - bool ret = machine__read_build_ids(&session->host_machine, with_hits); - - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret |= machine__read_build_ids(pos, with_hits); - } - - return ret; -} - static int write_trace_info(int fd, struct perf_header *h __used, struct perf_evlist *evlist) { @@ -446,9 +202,6 @@ static int write_build_id(int fd, struct perf_header *h, session = container_of(h, struct perf_session, header); - if (!perf_session__read_build_ids(session, true)) - return -1; - err = dsos__write_buildid_table(h, fd); if (err < 0) { pr_debug("failed to write buildid table\n"); @@ -1312,30 +1065,26 @@ struct feature_ops { bool full_only; }; -#define FEAT_OPA(n, func) \ - [n] = { .name = #n, .write = write_##func, .print = print_##func } -#define FEAT_OPF(n, func) \ - [n] = { .name = #n, .write = write_##func, .print = print_##func, .full_only = true } - -/* feature_ops not implemented: */ -#define print_trace_info NULL -#define print_build_id NULL +#define FEAT_OPA(n, w, p) \ + [n] = { .name = #n, .write = w, .print = p } +#define FEAT_OPF(n, w, p) \ + [n] = { .name = #n, .write = w, .print = p, .full_only = true } static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { - FEAT_OPA(HEADER_TRACE_INFO, trace_info), - FEAT_OPA(HEADER_BUILD_ID, build_id), - FEAT_OPA(HEADER_HOSTNAME, hostname), - FEAT_OPA(HEADER_OSRELEASE, osrelease), - FEAT_OPA(HEADER_VERSION, version), - FEAT_OPA(HEADER_ARCH, arch), - FEAT_OPA(HEADER_NRCPUS, nrcpus), - FEAT_OPA(HEADER_CPUDESC, cpudesc), - FEAT_OPA(HEADER_CPUID, cpuid), - FEAT_OPA(HEADER_TOTAL_MEM, total_mem), - FEAT_OPA(HEADER_EVENT_DESC, event_desc), - FEAT_OPA(HEADER_CMDLINE, cmdline), - FEAT_OPF(HEADER_CPU_TOPOLOGY, cpu_topology), - FEAT_OPF(HEADER_NUMA_TOPOLOGY, 
numa_topology), + FEAT_OPA(HEADER_TRACE_INFO, write_trace_info, NULL), + FEAT_OPA(HEADER_BUILD_ID, write_build_id, NULL), + FEAT_OPA(HEADER_HOSTNAME, write_hostname, print_hostname), + FEAT_OPA(HEADER_OSRELEASE, write_osrelease, print_osrelease), + FEAT_OPA(HEADER_VERSION, write_version, print_version), + FEAT_OPA(HEADER_ARCH, write_arch, print_arch), + FEAT_OPA(HEADER_NRCPUS, write_nrcpus, print_nrcpus), + FEAT_OPA(HEADER_CPUDESC, write_cpudesc, print_cpudesc), + FEAT_OPA(HEADER_CPUID, write_cpuid, print_cpuid), + FEAT_OPA(HEADER_TOTAL_MEM, write_total_mem, print_total_mem), + FEAT_OPA(HEADER_EVENT_DESC, write_event_desc, print_event_desc), + FEAT_OPA(HEADER_CMDLINE, write_cmdline, print_cmdline), + FEAT_OPF(HEADER_CPU_TOPOLOGY, write_cpu_topology, print_cpu_topology), + FEAT_OPF(HEADER_NUMA_TOPOLOGY, write_numa_topology, print_numa_topology), }; struct header_print_data { @@ -1354,9 +1103,9 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section, "%d, continuing...\n", section->offset, feat); return 0; } - if (feat >= HEADER_LAST_FEATURE) { + if (feat < HEADER_TRACE_INFO || feat >= HEADER_LAST_FEATURE) { pr_warning("unknown feature %d\n", feat); - return 0; + return -1; } if (!feat_ops[feat].print) return 0; @@ -1383,16 +1132,260 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full) return 0; } -static int do_write_feat(int fd, struct perf_header *h, int type, - struct perf_file_section **p, - struct perf_evlist *evlist) -{ +#define dsos__for_each_with_build_id(pos, head) \ + list_for_each_entry(pos, head, node) \ + if (!pos->has_build_id) \ + continue; \ + else + +static int __dsos__write_buildid_table(struct list_head *head, pid_t pid, + u16 misc, int fd) +{ + struct dso *pos; + + dsos__for_each_with_build_id(pos, head) { + int err; + struct build_id_event b; + size_t len; + + if (!pos->hit) + continue; + len = pos->long_name_len + 1; + len = ALIGN(len, NAME_ALIGN); + memset(&b, 0, sizeof(b)); + memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); + b.pid = pid; + b.header.misc = misc; + b.header.size = sizeof(b) + len; + err = do_write(fd, &b, sizeof(b)); + if (err < 0) + return err; + err = write_padded(fd, pos->long_name, + pos->long_name_len + 1, len); + if (err < 0) + return err; + } + + return 0; +} + +static int machine__write_buildid_table(struct machine *machine, int fd) +{ + int err; + u16 kmisc = PERF_RECORD_MISC_KERNEL, + umisc = PERF_RECORD_MISC_USER; + + if (!machine__is_host(machine)) { + kmisc = PERF_RECORD_MISC_GUEST_KERNEL; + umisc = PERF_RECORD_MISC_GUEST_USER; + } + + err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid, + kmisc, fd); + if (err == 0) + err = __dsos__write_buildid_table(&machine->user_dsos, + machine->pid, umisc, fd); + return err; +} + +static int dsos__write_buildid_table(struct perf_header *header, int fd) +{ + struct perf_session *session = container_of(header, + struct perf_session, header); + struct rb_node *nd; + int err = machine__write_buildid_table(&session->host_machine, fd); + + if (err) + return err; + + for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + err = machine__write_buildid_table(pos, fd); + if (err) + break; + } + return err; +} + +int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, + const char *name, bool is_kallsyms) +{ + const size_t size = PATH_MAX; + char *realname, *filename = zalloc(size), + *linkname = zalloc(size), *targetname; + int len, err = 
-1;
+
+	if (is_kallsyms) {
+		if (symbol_conf.kptr_restrict) {
+			pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
+			return 0;
+		}
+		realname = (char *)name;
+	} else
+		realname = realpath(name, NULL);
+
+	if (realname == NULL || filename == NULL || linkname == NULL)
+		goto out_free;
+
+	len = snprintf(filename, size, "%s%s%s",
+		       debugdir, is_kallsyms ? "/" : "", realname);
+	if (mkdir_p(filename, 0755))
+		goto out_free;
+
+	snprintf(filename + len, size - len, "/%s", sbuild_id);
+
+	if (access(filename, F_OK)) {
+		if (is_kallsyms) {
+			if (copyfile("/proc/kallsyms", filename))
+				goto out_free;
+		} else if (link(realname, filename) && copyfile(name, filename))
+			goto out_free;
+	}
+
+	len = snprintf(linkname, size, "%s/.build-id/%.2s",
+		       debugdir, sbuild_id);
+
+	if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
+		goto out_free;
+
+	snprintf(linkname + len, size - len, "/%s", sbuild_id + 2);
+	targetname = filename + strlen(debugdir) - 5;
+	memcpy(targetname, "../..", 5);
+
+	if (symlink(targetname, linkname) == 0)
+		err = 0;
+out_free:
+	if (!is_kallsyms)
+		free(realname);
+	free(filename);
+	free(linkname);
+	return err;
+}
+
+static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
+				 const char *name, const char *debugdir,
+				 bool is_kallsyms)
+{
+	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+	build_id__sprintf(build_id, build_id_size, sbuild_id);
+
+	return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms);
+}
+
+int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
+{
+	const size_t size = PATH_MAX;
+	char *filename = zalloc(size),
+	     *linkname = zalloc(size);
+	int err = -1;
+
+	if (filename == NULL || linkname == NULL)
+		goto out_free;
+
+	snprintf(linkname, size, "%s/.build-id/%.2s/%s",
+		 debugdir, sbuild_id, sbuild_id + 2);
+
+	if (access(linkname, F_OK))
+		goto out_free;
+
+	if (readlink(linkname, filename, size - 1) < 0)
+		goto out_free;
+
+	if (unlink(linkname))
+		goto out_free;
+
+	/*
+	 * Since the link is relative, we must make it absolute:
+	 */
+	snprintf(linkname, size, "%s/.build-id/%.2s/%s",
+		 debugdir, sbuild_id, filename);
+
+	if (unlink(linkname))
+		goto out_free;
+
+	err = 0;
+out_free:
+	free(filename);
+	free(linkname);
+	return err;
+}
+
+static int dso__cache_build_id(struct dso *dso, const char *debugdir)
+{
+	bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
+
+	return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
+				     dso->long_name, debugdir, is_kallsyms);
+}
+
+static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
+{
+	struct dso *pos;
+	int err = 0;
+
+	dsos__for_each_with_build_id(pos, head)
+		if (dso__cache_build_id(pos, debugdir))
+			err = -1;
+
+	return err;
+}
+
+static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
+{
+	int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
+	ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
+	return ret;
+}
+
+static int perf_session__cache_build_ids(struct perf_session *session)
+{
+	struct rb_node *nd;
+	int ret;
+	char debugdir[PATH_MAX];
+
+	snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir);
+
+	if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
+		return -1;
+
+	ret = machine__cache_build_ids(&session->host_machine, debugdir);
+
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		ret |= machine__cache_build_ids(pos, debugdir);
+	}
+	return ret ? -1 : 0;
+}
+
+static bool machine__read_build_ids(struct machine *machine, bool with_hits)
+{
+	bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
+	ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
+	return ret;
+}
+
+static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
+{
+	struct rb_node *nd;
+	bool ret = machine__read_build_ids(&session->host_machine, with_hits);
+
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		ret |= machine__read_build_ids(pos, with_hits);
+	}
+
+	return ret;
+}
+
+static int do_write_feat(int fd, struct perf_header *h, int type,
+			 struct perf_file_section **p,
+			 struct perf_evlist *evlist)
+{
 	int err;
 	int ret = 0;
 
 	if (perf_header__has_feat(h, type)) {
-		if (!feat_ops[type].write)
-			return -1;
 
 		(*p)->offset = lseek(fd, 0, SEEK_CUR);
@@ -1415,12 +1408,18 @@ static int perf_header__adds_write(struct perf_header *header,
 					    struct perf_evlist *evlist, int fd)
 {
 	int nr_sections;
+	struct perf_session *session;
 	struct perf_file_section *feat_sec, *p;
 	int sec_size;
 	u64 sec_start;
-	int feat;
 	int err;
 
+	session = container_of(header, struct perf_session, header);
+
+	if (perf_header__has_feat(header, HEADER_BUILD_ID) &&
+	    !perf_session__read_build_ids(session, true))
+		perf_header__clear_feat(header, HEADER_BUILD_ID);
+
 	nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
 	if (!nr_sections)
 		return 0;
@@ -1434,11 +1433,64 @@ static int perf_header__adds_write(struct perf_header *header,
 	sec_start = header->data_offset + header->data_size;
 	lseek(fd, sec_start + sec_size, SEEK_SET);
 
-	for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
-		if (do_write_feat(fd, header, feat, &p, evlist))
-			perf_header__clear_feat(header, feat);
+	err = do_write_feat(fd, header, HEADER_TRACE_INFO, &p, evlist);
+	if (err)
+		goto out_free;
+
+	err = do_write_feat(fd, header, HEADER_BUILD_ID, &p, evlist);
+	if (err) {
+		perf_header__clear_feat(header, HEADER_BUILD_ID);
+		goto out_free;
 	}
+	err = do_write_feat(fd, header, HEADER_HOSTNAME, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_HOSTNAME);
+
+	err = do_write_feat(fd, header, HEADER_OSRELEASE, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_OSRELEASE);
+
+	err = do_write_feat(fd, header, HEADER_VERSION, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_VERSION);
+
+	err = do_write_feat(fd, header, HEADER_ARCH, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_ARCH);
+
+	err = do_write_feat(fd, header, HEADER_NRCPUS, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_NRCPUS);
+
+	err = do_write_feat(fd, header, HEADER_CPUDESC, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CPUDESC);
+
+	err = do_write_feat(fd, header, HEADER_CPUID, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CPUID);
+
+	err = do_write_feat(fd, header, HEADER_TOTAL_MEM, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_TOTAL_MEM);
+
+	err = do_write_feat(fd, header, HEADER_CMDLINE, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CMDLINE);
+
+	err = do_write_feat(fd, header, HEADER_EVENT_DESC, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_EVENT_DESC);
+
+	err = do_write_feat(fd, header, HEADER_CPU_TOPOLOGY, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CPU_TOPOLOGY);
+
+	err = do_write_feat(fd, header, HEADER_NUMA_TOPOLOGY,
&p, evlist); + if (err) + perf_header__clear_feat(header, HEADER_NUMA_TOPOLOGY); + lseek(fd, sec_start, SEEK_SET); /* * may write more than needed due to dropped feature, but @@ -1447,6 +1499,7 @@ static int perf_header__adds_write(struct perf_header *header, err = do_write(fd, feat_sec, sec_size); if (err < 0) pr_debug("failed to write feature section\n"); +out_free: free(feat_sec); return err; } @@ -1584,20 +1637,20 @@ static int perf_header__getbuffer64(struct perf_header *header, int perf_header__process_sections(struct perf_header *header, int fd, void *data, int (*process)(struct perf_file_section *section, - struct perf_header *ph, - int feat, int fd, void *data)) + struct perf_header *ph, + int feat, int fd, void *data)) { - struct perf_file_section *feat_sec, *sec; + struct perf_file_section *feat_sec; int nr_sections; int sec_size; - int feat; - int err; + int idx = 0; + int err = -1, feat = 1; nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); if (!nr_sections) return 0; - feat_sec = sec = calloc(sizeof(*feat_sec), nr_sections); + feat_sec = calloc(sizeof(*feat_sec), nr_sections); if (!feat_sec) return -1; @@ -1605,16 +1658,20 @@ int perf_header__process_sections(struct perf_header *header, int fd, lseek(fd, header->data_offset + header->data_size, SEEK_SET); - err = perf_header__getbuffer64(header, fd, feat_sec, sec_size); - if (err < 0) + if (perf_header__getbuffer64(header, fd, feat_sec, sec_size)) goto out_free; - for_each_set_bit(feat, header->adds_features, HEADER_LAST_FEATURE) { - err = process(sec++, header, feat, fd, data); - if (err < 0) - goto out_free; - } err = 0; + while (idx < nr_sections && feat < HEADER_LAST_FEATURE) { + if (perf_header__has_feat(header, feat)) { + struct perf_file_section *sec = &feat_sec[idx++]; + + err = process(sec, header, feat, fd, data); + if (err < 0) + break; + } + ++feat; + } out_free: free(feat_sec); return err; @@ -1849,21 +1906,32 @@ static int perf_file_section__process(struct perf_file_section *section, return 0; } - if (feat >= HEADER_LAST_FEATURE) { - pr_debug("unknown feature %d, continuing...\n", feat); - return 0; - } - switch (feat) { case HEADER_TRACE_INFO: trace_report(fd, false); break; + case HEADER_BUILD_ID: if (perf_header__read_build_ids(ph, fd, section->offset, section->size)) pr_debug("Failed to read buildids, continuing...\n"); break; - default: + + case HEADER_HOSTNAME: + case HEADER_OSRELEASE: + case HEADER_VERSION: + case HEADER_ARCH: + case HEADER_NRCPUS: + case HEADER_CPUDESC: + case HEADER_CPUID: + case HEADER_TOTAL_MEM: + case HEADER_CMDLINE: + case HEADER_EVENT_DESC: + case HEADER_CPU_TOPOLOGY: + case HEADER_NUMA_TOPOLOGY: break; + + default: + pr_debug("unknown feature %d, continuing...\n", feat); } return 0; @@ -1973,8 +2041,6 @@ int perf_session__read_header(struct perf_session *session, int fd) lseek(fd, tmp, SEEK_SET); } - symbol_conf.nr_events = nr_attrs; - if (f_header.event_types.size) { lseek(fd, f_header.event_types.offset, SEEK_SET); events = malloc(f_header.event_types.size); @@ -2002,9 +2068,9 @@ int perf_session__read_header(struct perf_session *session, int fd) return -ENOMEM; } -int perf_event__synthesize_attr(struct perf_tool *tool, - struct perf_event_attr *attr, u16 ids, u64 *id, - perf_event__handler_t process) +int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, + perf_event__handler_t process, + struct perf_session *session) { union perf_event *ev; size_t size; @@ -2026,23 +2092,22 @@ int perf_event__synthesize_attr(struct perf_tool *tool, 
ev->attr.header.type = PERF_RECORD_HEADER_ATTR; ev->attr.header.size = size; - err = process(tool, ev, NULL, NULL); + err = process(ev, NULL, session); free(ev); return err; } -int perf_event__synthesize_attrs(struct perf_tool *tool, - struct perf_session *session, +int perf_session__synthesize_attrs(struct perf_session *session, perf_event__handler_t process) { struct perf_evsel *attr; int err = 0; list_for_each_entry(attr, &session->evlist->entries, node) { - err = perf_event__synthesize_attr(tool, &attr->attr, attr->ids, - attr->id, process); + err = perf_event__synthesize_attr(&attr->attr, attr->ids, + attr->id, process, session); if (err) { pr_debug("failed to create perf header attribute\n"); return err; @@ -2053,23 +2118,23 @@ int perf_event__synthesize_attrs(struct perf_tool *tool, } int perf_event__process_attr(union perf_event *event, - struct perf_evlist **pevlist) + struct perf_session *session) { unsigned int i, ids, n_ids; struct perf_evsel *evsel; - struct perf_evlist *evlist = *pevlist; - if (evlist == NULL) { - *pevlist = evlist = perf_evlist__new(NULL, NULL); - if (evlist == NULL) + if (session->evlist == NULL) { + session->evlist = perf_evlist__new(NULL, NULL); + if (session->evlist == NULL) return -ENOMEM; } - evsel = perf_evsel__new(&event->attr.attr, evlist->nr_entries); + evsel = perf_evsel__new(&event->attr.attr, + session->evlist->nr_entries); if (evsel == NULL) return -ENOMEM; - perf_evlist__add(evlist, evsel); + perf_evlist__add(session->evlist, evsel); ids = event->header.size; ids -= (void *)&event->attr.id - (void *)event; @@ -2083,16 +2148,18 @@ int perf_event__process_attr(union perf_event *event, return -ENOMEM; for (i = 0; i < n_ids; i++) { - perf_evlist__id_add(evlist, evsel, 0, i, event->attr.id[i]); + perf_evlist__id_add(session->evlist, evsel, 0, i, + event->attr.id[i]); } + perf_session__update_sample_type(session); + return 0; } -int perf_event__synthesize_event_type(struct perf_tool *tool, - u64 event_id, char *name, +int perf_event__synthesize_event_type(u64 event_id, char *name, perf_event__handler_t process, - struct machine *machine) + struct perf_session *session) { union perf_event ev; size_t size = 0; @@ -2110,14 +2177,13 @@ int perf_event__synthesize_event_type(struct perf_tool *tool, ev.event_type.header.size = sizeof(ev.event_type) - (sizeof(ev.event_type.event_type.name) - size); - err = process(tool, &ev, NULL, machine); + err = process(&ev, NULL, session); return err; } -int perf_event__synthesize_event_types(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine) +int perf_event__synthesize_event_types(perf_event__handler_t process, + struct perf_session *session) { struct perf_trace_event_type *type; int i, err = 0; @@ -2125,9 +2191,9 @@ int perf_event__synthesize_event_types(struct perf_tool *tool, for (i = 0; i < event_count; i++) { type = &events[i]; - err = perf_event__synthesize_event_type(tool, type->event_id, + err = perf_event__synthesize_event_type(type->event_id, type->name, process, - machine); + session); if (err) { pr_debug("failed to create perf header event type\n"); return err; @@ -2137,8 +2203,8 @@ int perf_event__synthesize_event_types(struct perf_tool *tool, return err; } -int perf_event__process_event_type(struct perf_tool *tool __unused, - union perf_event *event) +int perf_event__process_event_type(union perf_event *event, + struct perf_session *session __unused) { if (perf_header__push_event(event->event_type.event_type.event_id, event->event_type.event_type.name) < 0) @@ -2147,9 
+2213,9 @@ int perf_event__process_event_type(struct perf_tool *tool __unused, return 0; } -int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, - struct perf_evlist *evlist, - perf_event__handler_t process) +int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, + perf_event__handler_t process, + struct perf_session *session __unused) { union perf_event ev; struct tracing_data *tdata; @@ -2180,7 +2246,7 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, ev.tracing_data.header.size = sizeof(ev.tracing_data); ev.tracing_data.size = aligned_size; - process(tool, &ev, NULL, NULL); + process(&ev, NULL, session); /* * The put function will copy all the tracing data @@ -2222,10 +2288,10 @@ int perf_event__process_tracing_data(union perf_event *event, return size_read + padding; } -int perf_event__synthesize_build_id(struct perf_tool *tool, - struct dso *pos, u16 misc, +int perf_event__synthesize_build_id(struct dso *pos, u16 misc, perf_event__handler_t process, - struct machine *machine) + struct machine *machine, + struct perf_session *session) { union perf_event ev; size_t len; @@ -2245,13 +2311,12 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, ev.build_id.header.size = sizeof(ev.build_id) + len; memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); - err = process(tool, &ev, NULL, machine); + err = process(&ev, NULL, session); return err; } -int perf_event__process_build_id(struct perf_tool *tool __used, - union perf_event *event, +int perf_event__process_build_id(union perf_event *event, struct perf_session *session) { __event_process_build_id(&event->build_id, diff --git a/trunk/tools/perf/util/header.h b/trunk/tools/perf/util/header.h index ac4ec956024e..3d5a742f4a2a 100644 --- a/trunk/tools/perf/util/header.h +++ b/trunk/tools/perf/util/header.h @@ -10,8 +10,7 @@ #include enum { - HEADER_RESERVED = 0, /* always cleared */ - HEADER_TRACE_INFO = 1, + HEADER_TRACE_INFO = 1, HEADER_BUILD_ID, HEADER_HOSTNAME, @@ -28,9 +27,10 @@ enum { HEADER_NUMA_TOPOLOGY, HEADER_LAST_FEATURE, - HEADER_FEAT_BITS = 256, }; +#define HEADER_FEAT_BITS 256 + struct perf_file_section { u64 offset; u64 size; @@ -68,7 +68,6 @@ struct perf_header { }; struct perf_evlist; -struct perf_session; int perf_session__read_header(struct perf_session *session, int fd); int perf_session__write_header(struct perf_session *session, @@ -97,36 +96,32 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, const char *name, bool is_kallsyms); int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir); -int perf_event__synthesize_attr(struct perf_tool *tool, - struct perf_event_attr *attr, u16 ids, u64 *id, - perf_event__handler_t process); -int perf_event__synthesize_attrs(struct perf_tool *tool, - struct perf_session *session, - perf_event__handler_t process); -int perf_event__process_attr(union perf_event *event, struct perf_evlist **pevlist); +int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, + perf_event__handler_t process, + struct perf_session *session); +int perf_session__synthesize_attrs(struct perf_session *session, + perf_event__handler_t process); +int perf_event__process_attr(union perf_event *event, struct perf_session *session); -int perf_event__synthesize_event_type(struct perf_tool *tool, - u64 event_id, char *name, +int perf_event__synthesize_event_type(u64 event_id, char *name, perf_event__handler_t process, - struct machine *machine); -int 
perf_event__synthesize_event_types(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine); -int perf_event__process_event_type(struct perf_tool *tool, - union perf_event *event); - -int perf_event__synthesize_tracing_data(struct perf_tool *tool, - int fd, struct perf_evlist *evlist, - perf_event__handler_t process); + struct perf_session *session); +int perf_event__synthesize_event_types(perf_event__handler_t process, + struct perf_session *session); +int perf_event__process_event_type(union perf_event *event, + struct perf_session *session); + +int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, + perf_event__handler_t process, + struct perf_session *session); int perf_event__process_tracing_data(union perf_event *event, struct perf_session *session); -int perf_event__synthesize_build_id(struct perf_tool *tool, - struct dso *pos, u16 misc, +int perf_event__synthesize_build_id(struct dso *pos, u16 misc, perf_event__handler_t process, - struct machine *machine); -int perf_event__process_build_id(struct perf_tool *tool, - union perf_event *event, + struct machine *machine, + struct perf_session *session); +int perf_event__process_build_id(union perf_event *event, struct perf_session *session); /* diff --git a/trunk/tools/perf/util/hist.h b/trunk/tools/perf/util/hist.h index ff6f9d56ea41..89289c8e935e 100644 --- a/trunk/tools/perf/util/hist.h +++ b/trunk/tools/perf/util/hist.h @@ -117,6 +117,7 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used, static inline int hist_entry__tui_annotate(struct hist_entry *self __used, int evidx __used, + int nr_events __used, void(*timer)(void *arg) __used, void *arg __used, int delay_secs __used) @@ -127,7 +128,7 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used, #define K_RIGHT -2 #else #include "ui/keysyms.h" -int hist_entry__tui_annotate(struct hist_entry *he, int evidx, +int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events, void(*timer)(void *arg), void *arg, int delay_secs); int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, diff --git a/trunk/tools/perf/util/include/linux/bitops.h b/trunk/tools/perf/util/include/linux/bitops.h index 62cdee78db7b..305c8484f200 100644 --- a/trunk/tools/perf/util/include/linux/bitops.h +++ b/trunk/tools/perf/util/include/linux/bitops.h @@ -9,17 +9,6 @@ #define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) -#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - static inline void set_bit(int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); @@ -41,111 +30,4 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - -/** - * __ffs - find first bit in word. - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. 
- */ -static __always_inline unsigned long __ffs(unsigned long word) -{ - int num = 0; - -#if BITS_PER_LONG == 64 - if ((word & 0xffffffff) == 0) { - num += 32; - word >>= 32; - } -#endif - if ((word & 0xffff) == 0) { - num += 16; - word >>= 16; - } - if ((word & 0xff) == 0) { - num += 8; - word >>= 8; - } - if ((word & 0xf) == 0) { - num += 4; - word >>= 4; - } - if ((word & 0x3) == 0) { - num += 2; - word >>= 2; - } - if ((word & 0x1) == 0) - num += 1; - return num; -} - -/* - * Find the first set bit in a memory region. - */ -static inline unsigned long -find_first_bit(const unsigned long *addr, unsigned long size) -{ - const unsigned long *p = addr; - unsigned long result = 0; - unsigned long tmp; - - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found: - return result + __ffs(tmp); -} - -/* - * Find the next set bit in a memory region. - */ -static inline unsigned long -find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) -{ - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG-1); - unsigned long tmp; - - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + __ffs(tmp); -} - #endif diff --git a/trunk/tools/perf/util/map.c b/trunk/tools/perf/util/map.c index 316aa0ab7122..78284b13e808 100644 --- a/trunk/tools/perf/util/map.c +++ b/trunk/tools/perf/util/map.c @@ -562,10 +562,6 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid) INIT_LIST_HEAD(&self->user_dsos); INIT_LIST_HEAD(&self->kernel_dsos); - self->threads = RB_ROOT; - INIT_LIST_HEAD(&self->dead_threads); - self->last_match = NULL; - self->kmaps.machine = self; self->pid = pid; self->root_dir = strdup(root_dir); diff --git a/trunk/tools/perf/util/map.h b/trunk/tools/perf/util/map.h index 2b8017f8a930..890d85545d0f 100644 --- a/trunk/tools/perf/util/map.h +++ b/trunk/tools/perf/util/map.h @@ -18,11 +18,9 @@ enum map_type { extern const char *map_type__name[MAP__NR_TYPES]; struct dso; -struct ip_callchain; struct ref_reloc_sym; struct map_groups; struct machine; -struct perf_evsel; struct map { union { @@ -63,11 +61,7 @@ struct map_groups { struct machine { struct rb_node rb_node; pid_t pid; - u16 id_hdr_size; char *root_dir; - struct rb_root threads; - struct list_head dead_threads; - struct thread *last_match; struct list_head user_dsos; struct list_head kernel_dsos; struct map_groups kmaps; @@ -154,13 +148,6 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid); void machine__exit(struct machine *self); void machine__delete(struct machine *self); -int machine__resolve_callchain(struct machine *machine, - struct perf_evsel *evsel, struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent); -int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name, - u64 addr); - /* * Default guest kernel is defined by parameter --guestkallsyms * and --guestmodules @@ -203,12 +190,6 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg, struct map **mapp, symbol_filter_t filter); - -struct thread *machine__findnew_thread(struct machine *machine, pid_t pid); -void machine__remove_thread(struct machine *machine, struct thread *th); - -size_t machine__fprintf(struct machine *machine, FILE *fp); - static inline struct symbol *machine__find_kernel_symbol(struct machine *self, enum map_type type, u64 addr, diff --git a/trunk/tools/perf/util/parse-events.c b/trunk/tools/perf/util/parse-events.c index 531c283fc0c5..928918b796b2 100644 --- a/trunk/tools/perf/util/parse-events.c +++ b/trunk/tools/perf/util/parse-events.c @@ -25,6 +25,8 @@ enum event_result { EVT_HANDLED_ALL }; +char debugfs_path[MAXPATHLEN]; + #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x @@ -38,7 +40,6 @@ static struct event_symbol event_symbols[] = { { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, - { CHW(REF_CPU_CYCLES), "ref-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, @@ -69,7 +70,6 @@ static const char *hw_event_names[PERF_COUNT_HW_MAX] = { "bus-cycles", "stalled-cycles-frontend", "stalled-cycles-backend", - "ref-cycles", }; static const char *sw_event_names[PERF_COUNT_SW_MAX] = { @@ -140,7 +140,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) char evt_path[MAXPATHLEN]; int fd; - snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path, + snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, sys_dir->d_name, 
evt_dir->d_name); fd = open(evt_path, O_RDONLY); if (fd < 0) @@ -171,16 +171,16 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; - if (debugfs_valid_mountpoint(tracing_events_path)) + if (debugfs_valid_mountpoint(debugfs_path)) return NULL; - sys_dir = opendir(tracing_events_path); + sys_dir = opendir(debugfs_path); if (!sys_dir) return NULL; for_each_subsystem(sys_dir, sys_dirent, sys_next) { - snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, + snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_dirent.d_name); evt_dir = opendir(dir_path); if (!evt_dir) @@ -447,7 +447,7 @@ parse_single_tracepoint_event(char *sys_name, u64 id; int fd; - snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path, + snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, sys_name, evt_name); fd = open(evt_path, O_RDONLY); @@ -485,7 +485,7 @@ parse_multiple_tracepoint_event(struct perf_evlist *evlist, char *sys_name, struct dirent *evt_ent; DIR *evt_dir; - snprintf(evt_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_name); + snprintf(evt_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_name); evt_dir = opendir(evt_path); if (!evt_dir) { @@ -528,7 +528,7 @@ parse_tracepoint_event(struct perf_evlist *evlist, const char **strp, char sys_name[MAX_EVENT_LENGTH]; unsigned int sys_length, evt_length; - if (debugfs_valid_mountpoint(tracing_events_path)) + if (debugfs_valid_mountpoint(debugfs_path)) return 0; evt_name = strchr(*strp, ':'); @@ -920,10 +920,10 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob) char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; - if (debugfs_valid_mountpoint(tracing_events_path)) + if (debugfs_valid_mountpoint(debugfs_path)) return; - sys_dir = opendir(tracing_events_path); + sys_dir = opendir(debugfs_path); if (!sys_dir) return; @@ -932,7 +932,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob) !strglobmatch(sys_dirent.d_name, subsys_glob)) continue; - snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, + snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_dirent.d_name); evt_dir = opendir(dir_path); if (!evt_dir) @@ -964,16 +964,16 @@ int is_valid_tracepoint(const char *event_string) char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; - if (debugfs_valid_mountpoint(tracing_events_path)) + if (debugfs_valid_mountpoint(debugfs_path)) return 0; - sys_dir = opendir(tracing_events_path); + sys_dir = opendir(debugfs_path); if (!sys_dir) return 0; for_each_subsystem(sys_dir, sys_dirent, sys_next) { - snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, + snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_dirent.d_name); evt_dir = opendir(dir_path); if (!evt_dir) diff --git a/trunk/tools/perf/util/parse-events.h b/trunk/tools/perf/util/parse-events.h index 7e0cbe75d5f1..2f8e375e038d 100644 --- a/trunk/tools/perf/util/parse-events.h +++ b/trunk/tools/perf/util/parse-events.h @@ -39,6 +39,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob); int print_hwcache_events(const char *event_glob); extern int is_valid_tracepoint(const char *event_string); +extern char debugfs_path[]; extern int valid_debugfs_mount(const char *debugfs); #endif /* __PERF_PARSE_EVENTS_H */ diff --git a/trunk/tools/perf/util/probe-finder.h b/trunk/tools/perf/util/probe-finder.h index 17e94d0c36f9..1132c8f0ce89 100644 --- a/trunk/tools/perf/util/probe-finder.h +++ b/trunk/tools/perf/util/probe-finder.h @@ 
-5,6 +5,7 @@ #include "util.h" #include "probe-event.h" +#define MAX_PATH_LEN 256 #define MAX_PROBE_BUFFER 1024 #define MAX_PROBES 128 diff --git a/trunk/tools/perf/util/scripting-engines/trace-event-perl.c b/trunk/tools/perf/util/scripting-engines/trace-event-perl.c index e30749e38a9b..74350ffb57fe 100644 --- a/trunk/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/trunk/tools/perf/util/scripting-engines/trace-event-perl.c @@ -27,10 +27,7 @@ #include "../../perf.h" #include "../util.h" -#include "../thread.h" -#include "../event.h" #include "../trace-event.h" -#include "../evsel.h" #include #include @@ -248,11 +245,11 @@ static inline struct event *find_cache_event(int type) return event; } -static void perl_process_tracepoint(union perf_event *pevent __unused, - struct perf_sample *sample, - struct perf_evsel *evsel, - struct machine *machine __unused, - struct thread *thread) +static void perl_process_event(union perf_event *pevent __unused, + struct perf_sample *sample, + struct perf_evsel *evsel, + struct perf_session *session __unused, + struct thread *thread) { struct format_field *field; static char handler[256]; @@ -268,9 +265,6 @@ static void perl_process_tracepoint(union perf_event *pevent __unused, dSP; - if (evsel->attr.type != PERF_TYPE_TRACEPOINT) - return; - type = trace_parse_common_type(data); event = find_cache_event(type); @@ -338,42 +332,6 @@ static void perl_process_tracepoint(union perf_event *pevent __unused, LEAVE; } -static void perl_process_event_generic(union perf_event *pevent __unused, - struct perf_sample *sample, - struct perf_evsel *evsel __unused, - struct machine *machine __unused, - struct thread *thread __unused) -{ - dSP; - - if (!get_cv("process_event", 0)) - return; - - ENTER; - SAVETMPS; - PUSHMARK(SP); - XPUSHs(sv_2mortal(newSVpvn((const char *)pevent, pevent->header.size))); - XPUSHs(sv_2mortal(newSVpvn((const char *)&evsel->attr, sizeof(evsel->attr)))); - XPUSHs(sv_2mortal(newSVpvn((const char *)sample, sizeof(*sample)))); - XPUSHs(sv_2mortal(newSVpvn((const char *)sample->raw_data, sample->raw_size))); - PUTBACK; - call_pv("process_event", G_SCALAR); - SPAGAIN; - PUTBACK; - FREETMPS; - LEAVE; -} - -static void perl_process_event(union perf_event *pevent, - struct perf_sample *sample, - struct perf_evsel *evsel, - struct machine *machine, - struct thread *thread) -{ - perl_process_tracepoint(pevent, sample, evsel, machine, thread); - perl_process_event_generic(pevent, sample, evsel, machine, thread); -} - static void run_start_sub(void) { dSP; /* access to Perl stack */ @@ -595,28 +553,7 @@ static int perl_generate_script(const char *outfile) fprintf(ofp, "sub print_header\n{\n" "\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n" "\tprintf(\"%%-20s %%5u %%05u.%%09u %%8u %%-20s \",\n\t " - "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}\n"); - - fprintf(ofp, - "\n# Packed byte string args of process_event():\n" - "#\n" - "# $event:\tunion perf_event\tutil/event.h\n" - "# $attr:\tstruct perf_event_attr\tlinux/perf_event.h\n" - "# $sample:\tstruct perf_sample\tutil/event.h\n" - "# $raw_data:\tperf_sample->raw_data\tutil/event.h\n" - "\n" - "sub process_event\n" - "{\n" - "\tmy ($event, $attr, $sample, $raw_data) = @_;\n" - "\n" - "\tmy @event\t= unpack(\"LSS\", $event);\n" - "\tmy @attr\t= unpack(\"LLQQQQQLLQQ\", $attr);\n" - "\tmy @sample\t= unpack(\"QLLQQQQQLL\", $sample);\n" - "\tmy @raw_data\t= unpack(\"C*\", $raw_data);\n" - "\n" - "\tuse Data::Dumper;\n" - "\tprint Dumper \\@event, \\@attr, \\@sample, \\@raw_data;\n" 
- "}\n"); + "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}"); fclose(ofp); diff --git a/trunk/tools/perf/util/scripting-engines/trace-event-python.c b/trunk/tools/perf/util/scripting-engines/trace-event-python.c index 0b2a48783172..6ccf70e8d8f2 100644 --- a/trunk/tools/perf/util/scripting-engines/trace-event-python.c +++ b/trunk/tools/perf/util/scripting-engines/trace-event-python.c @@ -29,8 +29,6 @@ #include "../../perf.h" #include "../util.h" -#include "../event.h" -#include "../thread.h" #include "../trace-event.h" PyMODINIT_FUNC initperf_trace_context(void); @@ -209,7 +207,7 @@ static inline struct event *find_cache_event(int type) static void python_process_event(union perf_event *pevent __unused, struct perf_sample *sample, struct perf_evsel *evsel __unused, - struct machine *machine __unused, + struct perf_session *session __unused, struct thread *thread) { PyObject *handler, *retval, *context, *t, *obj, *dict = NULL; diff --git a/trunk/tools/perf/util/session.c b/trunk/tools/perf/util/session.c index b5ca2558c7bb..0f4555ce9063 100644 --- a/trunk/tools/perf/util/session.c +++ b/trunk/tools/perf/util/session.c @@ -10,7 +10,6 @@ #include "evlist.h" #include "evsel.h" #include "session.h" -#include "tool.h" #include "sort.h" #include "util.h" #include "cpumap.h" @@ -79,13 +78,39 @@ static int perf_session__open(struct perf_session *self, bool force) return -1; } +static void perf_session__id_header_size(struct perf_session *session) +{ + struct perf_sample *data; + u64 sample_type = session->sample_type; + u16 size = 0; + + if (!session->sample_id_all) + goto out; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid) * 2; + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu) * 2; +out: + session->id_hdr_size = size; +} + void perf_session__update_sample_type(struct perf_session *self) { self->sample_type = perf_evlist__sample_type(self->evlist); self->sample_size = __perf_evsel__sample_size(self->sample_type); self->sample_id_all = perf_evlist__sample_id_all(self->evlist); - self->id_hdr_size = perf_evlist__id_hdr_size(self->evlist); - self->host_machine.id_hdr_size = self->id_hdr_size; + perf_session__id_header_size(self); } int perf_session__create_kernel_maps(struct perf_session *self) @@ -105,26 +130,18 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self) struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe, - struct perf_tool *tool) + struct perf_event_ops *ops) { - struct perf_session *self; - struct stat st; - size_t len; - - if (!filename || !strlen(filename)) { - if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode)) - filename = "-"; - else - filename = "perf.data"; - } - - len = strlen(filename); - self = zalloc(sizeof(*self) + len); + size_t len = filename ? strlen(filename) + 1 : 0; + struct perf_session *self = zalloc(sizeof(*self) + len); if (self == NULL) goto out; memcpy(self->filename, filename, len); + self->threads = RB_ROOT; + INIT_LIST_HEAD(&self->dead_threads); + self->last_match = NULL; /* * On 64bit we can mmap the data file in one go. No need for tiny mmap * slices. On 32bit we use 32MB. 
@@ -154,10 +171,10 @@ struct perf_session *perf_session__new(const char *filename, int mode, goto out_delete; } - if (tool && tool->ordering_requires_timestamps && - tool->ordered_samples && !self->sample_id_all) { + if (ops && ops->ordering_requires_timestamps && + ops->ordered_samples && !self->sample_id_all) { dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); - tool->ordered_samples = false; + ops->ordered_samples = false; } out: @@ -167,22 +184,17 @@ struct perf_session *perf_session__new(const char *filename, int mode, return NULL; } -static void machine__delete_dead_threads(struct machine *machine) +static void perf_session__delete_dead_threads(struct perf_session *self) { struct thread *n, *t; - list_for_each_entry_safe(t, n, &machine->dead_threads, node) { + list_for_each_entry_safe(t, n, &self->dead_threads, node) { list_del(&t->node); thread__delete(t); } } -static void perf_session__delete_dead_threads(struct perf_session *session) -{ - machine__delete_dead_threads(&session->host_machine); -} - -static void machine__delete_threads(struct machine *self) +static void perf_session__delete_threads(struct perf_session *self) { struct rb_node *nd = rb_first(&self->threads); @@ -195,11 +207,6 @@ static void machine__delete_threads(struct machine *self) } } -static void perf_session__delete_threads(struct perf_session *session) -{ - machine__delete_threads(&session->host_machine); -} - void perf_session__delete(struct perf_session *self) { perf_session__destroy_kernel_maps(self); @@ -210,7 +217,7 @@ void perf_session__delete(struct perf_session *self) free(self); } -void machine__remove_thread(struct machine *self, struct thread *th) +void perf_session__remove_thread(struct perf_session *self, struct thread *th) { self->last_match = NULL; rb_erase(&th->rb_node, &self->threads); @@ -229,16 +236,16 @@ static bool symbol__match_parent_regex(struct symbol *sym) return 0; } -int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, - struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent) +int perf_session__resolve_callchain(struct perf_session *self, + struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent) { u8 cpumode = PERF_RECORD_MISC_USER; unsigned int i; int err; - callchain_cursor_reset(&evsel->hists.callchain_cursor); + callchain_cursor_reset(&self->callchain_cursor); for (i = 0; i < chain->nr; i++) { u64 ip; @@ -265,7 +272,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, al.filtered = false; thread__find_addr_location(thread, self, cpumode, - MAP__FUNCTION, ip, &al, NULL); + MAP__FUNCTION, thread->pid, ip, &al, NULL); if (al.sym != NULL) { if (sort__has_parent && !*parent && symbol__match_parent_regex(al.sym)) @@ -274,7 +281,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, break; } - err = callchain_cursor_append(&evsel->hists.callchain_cursor, + err = callchain_cursor_append(&self->callchain_cursor, ip, al.map, al.sym); if (err) return err; @@ -283,91 +290,75 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, return 0; } -static int process_event_synth_tracing_data_stub(union perf_event *event __used, - struct perf_session *session __used) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_synth_attr_stub(union perf_event *event __used, - struct perf_evlist **pevlist __used) +static int process_event_synth_stub(union perf_event *event __used, + 
struct perf_session *session __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_event_sample_stub(struct perf_tool *tool __used, - union perf_event *event __used, +static int process_event_sample_stub(union perf_event *event __used, struct perf_sample *sample __used, struct perf_evsel *evsel __used, - struct machine *machine __used) + struct perf_session *session __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_event_stub(struct perf_tool *tool __used, - union perf_event *event __used, +static int process_event_stub(union perf_event *event __used, struct perf_sample *sample __used, - struct machine *machine __used) + struct perf_session *session __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_finished_round_stub(struct perf_tool *tool __used, - union perf_event *event __used, - struct perf_session *perf_session __used) +static int process_finished_round_stub(union perf_event *event __used, + struct perf_session *session __used, + struct perf_event_ops *ops __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_event_type_stub(struct perf_tool *tool __used, - union perf_event *event __used) -{ - dump_printf(": unhandled!\n"); - return 0; -} +static int process_finished_round(union perf_event *event, + struct perf_session *session, + struct perf_event_ops *ops); -static int process_finished_round(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); - -static void perf_tool__fill_defaults(struct perf_tool *tool) +static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) { - if (tool->sample == NULL) - tool->sample = process_event_sample_stub; - if (tool->mmap == NULL) - tool->mmap = process_event_stub; - if (tool->comm == NULL) - tool->comm = process_event_stub; - if (tool->fork == NULL) - tool->fork = process_event_stub; - if (tool->exit == NULL) - tool->exit = process_event_stub; - if (tool->lost == NULL) - tool->lost = perf_event__process_lost; - if (tool->read == NULL) - tool->read = process_event_sample_stub; - if (tool->throttle == NULL) - tool->throttle = process_event_stub; - if (tool->unthrottle == NULL) - tool->unthrottle = process_event_stub; - if (tool->attr == NULL) - tool->attr = process_event_synth_attr_stub; - if (tool->event_type == NULL) - tool->event_type = process_event_type_stub; - if (tool->tracing_data == NULL) - tool->tracing_data = process_event_synth_tracing_data_stub; - if (tool->build_id == NULL) - tool->build_id = process_finished_round_stub; - if (tool->finished_round == NULL) { - if (tool->ordered_samples) - tool->finished_round = process_finished_round; + if (handler->sample == NULL) + handler->sample = process_event_sample_stub; + if (handler->mmap == NULL) + handler->mmap = process_event_stub; + if (handler->comm == NULL) + handler->comm = process_event_stub; + if (handler->fork == NULL) + handler->fork = process_event_stub; + if (handler->exit == NULL) + handler->exit = process_event_stub; + if (handler->lost == NULL) + handler->lost = perf_event__process_lost; + if (handler->read == NULL) + handler->read = process_event_stub; + if (handler->throttle == NULL) + handler->throttle = process_event_stub; + if (handler->unthrottle == NULL) + handler->unthrottle = process_event_stub; + if (handler->attr == NULL) + handler->attr = process_event_synth_stub; + if (handler->event_type == NULL) + handler->event_type = process_event_synth_stub; + if (handler->tracing_data == NULL) + handler->tracing_data = process_event_synth_stub; + if 
(handler->build_id == NULL) + handler->build_id = process_event_synth_stub; + if (handler->finished_round == NULL) { + if (handler->ordered_samples) + handler->finished_round = process_finished_round; else - tool->finished_round = process_finished_round_stub; + handler->finished_round = process_finished_round_stub; } } @@ -499,11 +490,11 @@ static void perf_session_free_sample_buffers(struct perf_session *session) static int perf_session_deliver_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool, + struct perf_event_ops *ops, u64 file_offset); static void flush_sample_queue(struct perf_session *s, - struct perf_tool *tool) + struct perf_event_ops *ops) { struct ordered_samples *os = &s->ordered_samples; struct list_head *head = &os->samples; @@ -514,7 +505,7 @@ static void flush_sample_queue(struct perf_session *s, unsigned idx = 0, progress_next = os->nr_samples / 16; int ret; - if (!tool->ordered_samples || !limit) + if (!ops->ordered_samples || !limit) return; list_for_each_entry_safe(iter, tmp, head, list) { @@ -525,7 +516,7 @@ static void flush_sample_queue(struct perf_session *s, if (ret) pr_err("Can't parse sample, err = %d\n", ret); else - perf_session_deliver_event(s, iter->event, &sample, tool, + perf_session_deliver_event(s, iter->event, &sample, ops, iter->file_offset); os->last_flush = iter->timestamp; @@ -587,11 +578,11 @@ static void flush_sample_queue(struct perf_session *s, * Flush every events below timestamp 7 * etc... */ -static int process_finished_round(struct perf_tool *tool, - union perf_event *event __used, - struct perf_session *session) +static int process_finished_round(union perf_event *event __used, + struct perf_session *session, + struct perf_event_ops *ops) { - flush_sample_queue(session, tool); + flush_sample_queue(session, ops); session->ordered_samples.next_flush = session->ordered_samples.max_timestamp; return 0; @@ -746,26 +737,13 @@ static void dump_sample(struct perf_session *session, union perf_event *event, callchain__printf(sample); } -static struct machine * - perf_session__find_machine_for_cpumode(struct perf_session *session, - union perf_event *event) -{ - const u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - - if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) - return perf_session__find_machine(session, event->ip.pid); - - return perf_session__find_host_machine(session); -} - static int perf_session_deliver_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool, + struct perf_event_ops *ops, u64 file_offset) { struct perf_evsel *evsel; - struct machine *machine; dump_event(session, event, file_offset, sample); @@ -787,8 +765,6 @@ static int perf_session_deliver_event(struct perf_session *session, hists__inc_nr_events(&evsel->hists, event->header.type); } - machine = perf_session__find_machine_for_cpumode(session, event); - switch (event->header.type) { case PERF_RECORD_SAMPLE: dump_sample(session, event, sample); @@ -796,25 +772,23 @@ static int perf_session_deliver_event(struct perf_session *session, ++session->hists.stats.nr_unknown_id; return -1; } - return tool->sample(tool, event, sample, evsel, machine); + return ops->sample(event, sample, evsel, session); case PERF_RECORD_MMAP: - return tool->mmap(tool, event, sample, machine); + return ops->mmap(event, sample, session); case PERF_RECORD_COMM: - return tool->comm(tool, event, sample, machine); + return ops->comm(event, sample, session); 
case PERF_RECORD_FORK: - return tool->fork(tool, event, sample, machine); + return ops->fork(event, sample, session); case PERF_RECORD_EXIT: - return tool->exit(tool, event, sample, machine); + return ops->exit(event, sample, session); case PERF_RECORD_LOST: - if (tool->lost == perf_event__process_lost) - session->hists.stats.total_lost += event->lost.lost; - return tool->lost(tool, event, sample, machine); + return ops->lost(event, sample, session); case PERF_RECORD_READ: - return tool->read(tool, event, sample, evsel, machine); + return ops->read(event, sample, session); case PERF_RECORD_THROTTLE: - return tool->throttle(tool, event, sample, machine); + return ops->throttle(event, sample, session); case PERF_RECORD_UNTHROTTLE: - return tool->unthrottle(tool, event, sample, machine); + return ops->unthrottle(event, sample, session); default: ++session->hists.stats.nr_unknown_events; return -1; @@ -838,29 +812,24 @@ static int perf_session__preprocess_sample(struct perf_session *session, } static int perf_session__process_user_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool, u64 file_offset) + struct perf_event_ops *ops, u64 file_offset) { - int err; - dump_event(session, event, file_offset, NULL); /* These events are processed right away */ switch (event->header.type) { case PERF_RECORD_HEADER_ATTR: - err = tool->attr(event, &session->evlist); - if (err == 0) - perf_session__update_sample_type(session); - return err; + return ops->attr(event, session); case PERF_RECORD_HEADER_EVENT_TYPE: - return tool->event_type(tool, event); + return ops->event_type(event, session); case PERF_RECORD_HEADER_TRACING_DATA: /* setup for reading amidst mmap */ lseek(session->fd, file_offset, SEEK_SET); - return tool->tracing_data(event, session); + return ops->tracing_data(event, session); case PERF_RECORD_HEADER_BUILD_ID: - return tool->build_id(tool, event, session); + return ops->build_id(event, session); case PERF_RECORD_FINISHED_ROUND: - return tool->finished_round(tool, event, session); + return ops->finished_round(event, session, ops); default: return -EINVAL; } @@ -868,7 +837,7 @@ static int perf_session__process_user_event(struct perf_session *session, union static int perf_session__process_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool, + struct perf_event_ops *ops, u64 file_offset) { struct perf_sample sample; @@ -884,7 +853,7 @@ static int perf_session__process_event(struct perf_session *session, hists__inc_nr_events(&session->hists, event->header.type); if (event->header.type >= PERF_RECORD_USER_TYPE_START) - return perf_session__process_user_event(session, event, tool, file_offset); + return perf_session__process_user_event(session, event, ops, file_offset); /* * For all kernel events we get the sample data @@ -897,14 +866,14 @@ static int perf_session__process_event(struct perf_session *session, if (perf_session__preprocess_sample(session, event, &sample)) return 0; - if (tool->ordered_samples) { + if (ops->ordered_samples) { ret = perf_session_queue_event(session, event, &sample, file_offset); if (ret != -ETIME) return ret; } - return perf_session_deliver_event(session, event, &sample, tool, + return perf_session_deliver_event(session, event, &sample, ops, file_offset); } @@ -915,11 +884,6 @@ void perf_event_header__bswap(struct perf_event_header *self) self->size = bswap_16(self->size); } -struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) -{ - return 
machine__findnew_thread(&session->host_machine, pid); -} - static struct thread *perf_session__register_idle_thread(struct perf_session *self) { struct thread *thread = perf_session__findnew(self, 0); @@ -933,9 +897,9 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se } static void perf_session__warn_about_errors(const struct perf_session *session, - const struct perf_tool *tool) + const struct perf_event_ops *ops) { - if (tool->lost == perf_event__process_lost && + if (ops->lost == perf_event__process_lost && session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) { ui__warning("Processed %d events and lost %d chunks!\n\n" "Check IO/CPU overload!\n\n", @@ -970,7 +934,7 @@ static void perf_session__warn_about_errors(const struct perf_session *session, volatile int session_done; static int __perf_session__process_pipe_events(struct perf_session *self, - struct perf_tool *tool) + struct perf_event_ops *ops) { union perf_event event; uint32_t size; @@ -979,7 +943,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self, int err; void *p; - perf_tool__fill_defaults(tool); + perf_event_ops__fill_defaults(ops); head = 0; more: @@ -1015,7 +979,8 @@ static int __perf_session__process_pipe_events(struct perf_session *self, } } - if ((skip = perf_session__process_event(self, &event, tool, head)) < 0) { + if (size == 0 || + (skip = perf_session__process_event(self, &event, ops, head)) < 0) { dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n", head, event.header.size, event.header.type); /* @@ -1038,7 +1003,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self, done: err = 0; out_err: - perf_session__warn_about_errors(self, tool); + perf_session__warn_about_errors(self, ops); perf_session_free_sample_buffers(self); return err; } @@ -1069,7 +1034,7 @@ fetch_mmaped_event(struct perf_session *session, int __perf_session__process_events(struct perf_session *session, u64 data_offset, u64 data_size, - u64 file_size, struct perf_tool *tool) + u64 file_size, struct perf_event_ops *ops) { u64 head, page_offset, file_offset, file_pos, progress_next; int err, mmap_prot, mmap_flags, map_idx = 0; @@ -1078,7 +1043,7 @@ int __perf_session__process_events(struct perf_session *session, union perf_event *event; uint32_t size; - perf_tool__fill_defaults(tool); + perf_event_ops__fill_defaults(ops); page_size = sysconf(_SC_PAGESIZE); @@ -1133,7 +1098,7 @@ int __perf_session__process_events(struct perf_session *session, size = event->header.size; if (size == 0 || - perf_session__process_event(session, event, tool, file_pos) < 0) { + perf_session__process_event(session, event, ops, file_pos) < 0) { dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n", file_offset + head, event->header.size, event->header.type); @@ -1162,15 +1127,15 @@ int __perf_session__process_events(struct perf_session *session, err = 0; /* do the final flush for ordered samples */ session->ordered_samples.next_flush = ULLONG_MAX; - flush_sample_queue(session, tool); + flush_sample_queue(session, ops); out_err: - perf_session__warn_about_errors(session, tool); + perf_session__warn_about_errors(session, ops); perf_session_free_sample_buffers(session); return err; } int perf_session__process_events(struct perf_session *self, - struct perf_tool *tool) + struct perf_event_ops *ops) { int err; @@ -1181,9 +1146,9 @@ int perf_session__process_events(struct perf_session *self, err = __perf_session__process_events(self, self->header.data_offset, 
self->header.data_size, - self->size, tool); + self->size, ops); else - err = __perf_session__process_pipe_events(self, tool); + err = __perf_session__process_pipe_events(self, ops); return err; } @@ -1198,8 +1163,9 @@ bool perf_session__has_traces(struct perf_session *self, const char *msg) return true; } -int maps__set_kallsyms_ref_reloc_sym(struct map **maps, - const char *symbol_name, u64 addr) +int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps, + const char *symbol_name, + u64 addr) { char *bracket; enum map_type i; @@ -1258,27 +1224,6 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) return ret; } -size_t perf_session__fprintf(struct perf_session *session, FILE *fp) -{ - /* - * FIXME: Here we have to actually print all the machines in this - * session, not just the host... - */ - return machine__fprintf(&session->host_machine, fp); -} - -void perf_session__remove_thread(struct perf_session *session, - struct thread *th) -{ - /* - * FIXME: This one makes no sense, we need to remove the thread from - * the machine it belongs to, perf_session can have many machines, so - * doing it always on ->host_machine is wrong. Fix when auditing all - * the 'perf kvm' code. - */ - machine__remove_thread(&session->host_machine, th); -} - struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type) { @@ -1291,16 +1236,17 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, - struct machine *machine, struct perf_evsel *evsel, - int print_sym, int print_dso) +void perf_session__print_ip(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session, + int print_sym, int print_dso) { struct addr_location al; const char *symname, *dsoname; - struct callchain_cursor *cursor = &evsel->hists.callchain_cursor; + struct callchain_cursor *cursor = &session->callchain_cursor; struct callchain_cursor_node *node; - if (perf_event__preprocess_sample(event, machine, &al, sample, + if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { error("problem processing %d event, skipping it.\n", event->header.type); @@ -1309,7 +1255,7 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, if (symbol_conf.use_callchain && sample->callchain) { - if (machine__resolve_callchain(machine, evsel, al.thread, + if (perf_session__resolve_callchain(session, al.thread, sample->callchain, NULL) != 0) { if (verbose) error("Failed to resolve callchain. 
Skipping\n"); diff --git a/trunk/tools/perf/util/session.h b/trunk/tools/perf/util/session.h index 37bc38381fb6..6e393c98eb34 100644 --- a/trunk/tools/perf/util/session.h +++ b/trunk/tools/perf/util/session.h @@ -30,6 +30,9 @@ struct perf_session { struct perf_header header; unsigned long size; unsigned long mmap_window; + struct rb_root threads; + struct list_head dead_threads; + struct thread *last_match; struct machine host_machine; struct rb_root machines; struct perf_evlist *evlist; @@ -50,31 +53,65 @@ struct perf_session { int cwdlen; char *cwd; struct ordered_samples ordered_samples; - char filename[1]; + struct callchain_cursor callchain_cursor; + char filename[0]; }; -struct perf_tool; +struct perf_evsel; +struct perf_event_ops; + +typedef int (*event_sample)(union perf_event *event, struct perf_sample *sample, + struct perf_evsel *evsel, struct perf_session *session); +typedef int (*event_op)(union perf_event *self, struct perf_sample *sample, + struct perf_session *session); +typedef int (*event_synth_op)(union perf_event *self, + struct perf_session *session); +typedef int (*event_op2)(union perf_event *self, struct perf_session *session, + struct perf_event_ops *ops); + +struct perf_event_ops { + event_sample sample; + event_op mmap, + comm, + fork, + exit, + lost, + read, + throttle, + unthrottle; + event_synth_op attr, + event_type, + tracing_data, + build_id; + event_op2 finished_round; + bool ordered_samples; + bool ordering_requires_timestamps; +}; struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe, - struct perf_tool *tool); + struct perf_event_ops *ops); void perf_session__delete(struct perf_session *self); void perf_event_header__bswap(struct perf_event_header *self); int __perf_session__process_events(struct perf_session *self, u64 data_offset, u64 data_size, u64 size, - struct perf_tool *tool); + struct perf_event_ops *ops); int perf_session__process_events(struct perf_session *self, - struct perf_tool *tool); + struct perf_event_ops *event_ops); -int perf_session__resolve_callchain(struct perf_session *self, struct perf_evsel *evsel, +int perf_session__resolve_callchain(struct perf_session *self, struct thread *thread, struct ip_callchain *chain, struct symbol **parent); bool perf_session__has_traces(struct perf_session *self, const char *msg); +int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps, + const char *symbol_name, + u64 addr); + void mem_bswap_64(void *src, int byte_size); void perf_event__attr_swap(struct perf_event_attr *attr); @@ -107,16 +144,12 @@ struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t p static inline void perf_session__process_machines(struct perf_session *self, - struct perf_tool *tool, machine__process_t process) { - process(&self->host_machine, tool); - return machines__process(&self->machines, process, tool); + process(&self->host_machine, self); + return machines__process(&self->machines, process, self); } -struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); -size_t perf_session__fprintf(struct perf_session *self, FILE *fp); - size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp); size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, @@ -134,20 +167,13 @@ static inline int perf_session__parse_sample(struct perf_session *session, session->header.needs_swap); } -static inline int perf_session__synthesize_sample(struct perf_session *session, - union perf_event *event, - const struct perf_sample 
*sample) -{ - return perf_event__synthesize_sample(event, session->sample_type, - sample, session->header.needs_swap); -} - struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, - struct machine *machine, struct perf_evsel *evsel, - int print_sym, int print_dso); +void perf_session__print_ip(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session, + int print_sym, int print_dso); int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); diff --git a/trunk/tools/perf/util/setup.py b/trunk/tools/perf/util/setup.py index 36d4c5619575..95d370074928 100644 --- a/trunk/tools/perf/util/setup.py +++ b/trunk/tools/perf/util/setup.py @@ -27,8 +27,7 @@ def finalize_options(self): perf = Extension('perf', sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c', 'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c', - 'util/util.c', 'util/xyarray.c', 'util/cgroup.c', - 'util/debugfs.c'], + 'util/util.c', 'util/xyarray.c', 'util/cgroup.c'], include_dirs = ['util/include'], extra_compile_args = cflags, ) diff --git a/trunk/tools/perf/util/symbol.c b/trunk/tools/perf/util/symbol.c index 215d50f2042e..632b50c7bc26 100644 --- a/trunk/tools/perf/util/symbol.c +++ b/trunk/tools/perf/util/symbol.c @@ -1757,7 +1757,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, struct stat st; /*sshfs might return bad dent->d_type, so we have to stat*/ - snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); + sprintf(path, "%s/%s", dir_name, dent->d_name); if (stat(path, &st)) continue; @@ -1766,6 +1766,8 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, !strcmp(dent->d_name, "..")) continue; + snprintf(path, sizeof(path), "%s/%s", + dir_name, dent->d_name); ret = map_groups__set_modules_path_dir(mg, path); if (ret < 0) goto out; @@ -1786,6 +1788,9 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, if (map == NULL) continue; + snprintf(path, sizeof(path), "%s/%s", + dir_name, dent->d_name); + long_name = strdup(path); if (long_name == NULL) { ret = -1; @@ -2604,10 +2609,10 @@ int symbol__init(void) symbol_conf.initialized = true; return 0; -out_free_comm_list: - strlist__delete(symbol_conf.comm_list); out_free_dso_list: strlist__delete(symbol_conf.dso_list); +out_free_comm_list: + strlist__delete(symbol_conf.comm_list); return -1; } diff --git a/trunk/tools/perf/util/symbol.h b/trunk/tools/perf/util/symbol.h index 123c2e14353e..29f8d742e92f 100644 --- a/trunk/tools/perf/util/symbol.h +++ b/trunk/tools/perf/util/symbol.h @@ -68,7 +68,6 @@ struct strlist; struct symbol_conf { unsigned short priv_size; - unsigned short nr_events; bool try_vmlinux_path, use_modules, sort_by_name, diff --git a/trunk/tools/perf/util/thread.c b/trunk/tools/perf/util/thread.c index fb4b7ea6752f..d5d3b22250f3 100644 --- a/trunk/tools/perf/util/thread.c +++ b/trunk/tools/perf/util/thread.c @@ -61,7 +61,7 @@ static size_t thread__fprintf(struct thread *self, FILE *fp) map_groups__fprintf(&self->mg, verbose, fp); } -struct thread *machine__findnew_thread(struct machine *self, pid_t pid) +struct thread *perf_session__findnew(struct perf_session *self, pid_t pid) { struct rb_node **p = &self->threads.rb_node; struct rb_node *parent = NULL; @@ -125,12 +125,12 @@ int thread__fork(struct thread *self, struct thread *parent) return 0; } -size_t machine__fprintf(struct machine 
*machine, FILE *fp) +size_t perf_session__fprintf(struct perf_session *self, FILE *fp) { size_t ret = 0; struct rb_node *nd; - for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { + for (nd = rb_first(&self->threads); nd; nd = rb_next(nd)) { struct thread *pos = rb_entry(nd, struct thread, rb_node); ret += thread__fprintf(pos, fp); diff --git a/trunk/tools/perf/util/thread.h b/trunk/tools/perf/util/thread.h index 70c2c13ff679..e5f2401c1b5e 100644 --- a/trunk/tools/perf/util/thread.h +++ b/trunk/tools/perf/util/thread.h @@ -18,14 +18,16 @@ struct thread { int comm_len; }; -struct machine; +struct perf_session; void thread__delete(struct thread *self); int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); +struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); void thread__insert_map(struct thread *self, struct map *map); int thread__fork(struct thread *self, struct thread *parent); +size_t perf_session__fprintf(struct perf_session *self, FILE *fp); static inline struct map *thread__find_map(struct thread *self, enum map_type type, u64 addr) @@ -33,12 +35,14 @@ static inline struct map *thread__find_map(struct thread *self, return self ? map_groups__find(&self->mg, type, addr) : NULL; } -void thread__find_addr_map(struct thread *thread, struct machine *machine, - u8 cpumode, enum map_type type, u64 addr, +void thread__find_addr_map(struct thread *self, + struct perf_session *session, u8 cpumode, + enum map_type type, pid_t pid, u64 addr, struct addr_location *al); -void thread__find_addr_location(struct thread *thread, struct machine *machine, - u8 cpumode, enum map_type type, u64 addr, +void thread__find_addr_location(struct thread *self, + struct perf_session *session, u8 cpumode, + enum map_type type, pid_t pid, u64 addr, struct addr_location *al, symbol_filter_t filter); #endif /* __PERF_THREAD_H */ diff --git a/trunk/tools/perf/util/tool.h b/trunk/tools/perf/util/tool.h deleted file mode 100644 index b0e1aadba8d5..000000000000 --- a/trunk/tools/perf/util/tool.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __PERF_TOOL_H -#define __PERF_TOOL_H - -#include - -struct perf_session; -union perf_event; -struct perf_evlist; -struct perf_evsel; -struct perf_sample; -struct perf_tool; -struct machine; - -typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event, - struct perf_sample *sample, - struct perf_evsel *evsel, struct machine *machine); - -typedef int (*event_op)(struct perf_tool *tool, union perf_event *event, - struct perf_sample *sample, struct machine *machine); - -typedef int (*event_attr_op)(union perf_event *event, - struct perf_evlist **pevlist); -typedef int (*event_simple_op)(struct perf_tool *tool, union perf_event *event); - -typedef int (*event_synth_op)(union perf_event *event, - struct perf_session *session); - -typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event, - struct perf_session *session); - -struct perf_tool { - event_sample sample, - read; - event_op mmap, - comm, - fork, - exit, - lost, - throttle, - unthrottle; - event_attr_op attr; - event_synth_op tracing_data; - event_simple_op event_type; - event_op2 finished_round, - build_id; - bool ordered_samples; - bool ordering_requires_timestamps; -}; - -#endif /* __PERF_TOOL_H */ diff --git a/trunk/tools/perf/util/top.h b/trunk/tools/perf/util/top.h index a248f3c2c60d..399650967958 100644 --- a/trunk/tools/perf/util/top.h +++ b/trunk/tools/perf/util/top.h @@ -1,17 +1,15 @@ #ifndef __PERF_TOP_H #define 
__PERF_TOP_H 1 -#include "tool.h" #include "types.h" +#include "../perf.h" #include -#include struct perf_evlist; struct perf_evsel; struct perf_session; struct perf_top { - struct perf_tool tool; struct perf_evlist *evlist; /* * Symbols will be added here in perf_event__process_sample and will @@ -25,26 +23,10 @@ struct perf_top { int freq; pid_t target_pid, target_tid; bool hide_kernel_symbols, hide_user_symbols, zero; - bool system_wide; - bool use_tui, use_stdio; - bool sort_has_symbols; - bool dont_use_callchains; - bool kptr_restrict_warned; - bool vmlinux_warned; - bool inherit; - bool group; - bool sample_id_all_avail; - bool dump_symtab; const char *cpu_list; struct hist_entry *sym_filter_entry; struct perf_evsel *sym_evsel; struct perf_session *session; - struct winsize winsize; - unsigned int mmap_pages; - int default_interval; - int realtime_prio; - int sym_pcnt_filter; - const char *sym_filter; }; size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size); diff --git a/trunk/tools/perf/util/trace-event-info.c b/trunk/tools/perf/util/trace-event-info.c index ac6830d8292b..d2655f08bcc0 100644 --- a/trunk/tools/perf/util/trace-event-info.c +++ b/trunk/tools/perf/util/trace-event-info.c @@ -18,8 +18,7 @@ * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include -#include "util.h" +#define _GNU_SOURCE #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +44,10 @@ #define VERSION "0.5" +#define _STR(x) #x +#define STR(x) _STR(x) +#define MAX_PATH 256 + #define TRACE_CTRL "tracing_on" #define TRACE "trace" #define AVAILABLE "available_tracers" @@ -69,6 +73,26 @@ struct events { }; + +static void die(const char *fmt, ...) +{ + va_list ap; + int ret = errno; + + if (errno) + perror("perf"); + else + ret = -1; + + va_start(ap, fmt); + fprintf(stderr, " "); + vfprintf(stderr, fmt, ap); + va_end(ap); + + fprintf(stderr, "\n"); + exit(ret); +} + void *malloc_or_die(unsigned int size) { void *data; diff --git a/trunk/tools/perf/util/trace-event-scripting.c b/trunk/tools/perf/util/trace-event-scripting.c index a3fdf55f317b..c9dcbec7d800 100644 --- a/trunk/tools/perf/util/trace-event-scripting.c +++ b/trunk/tools/perf/util/trace-event-scripting.c @@ -39,7 +39,7 @@ static int stop_script_unsupported(void) static void process_event_unsupported(union perf_event *event __unused, struct perf_sample *sample __unused, struct perf_evsel *evsel __unused, - struct machine *machine __unused, + struct perf_session *session __unused, struct thread *thread __unused) { } diff --git a/trunk/tools/perf/util/trace-event.h b/trunk/tools/perf/util/trace-event.h index 58ae14c5baac..a84100817649 100644 --- a/trunk/tools/perf/util/trace-event.h +++ b/trunk/tools/perf/util/trace-event.h @@ -3,11 +3,7 @@ #include #include "parse-events.h" - -struct machine; -struct perf_sample; -union perf_event; -struct thread; +#include "session.h" #define __unused __attribute__((unused)) @@ -296,7 +292,7 @@ struct scripting_ops { void (*process_event) (union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, - struct machine *machine, + struct perf_session *session, struct thread *thread); int (*generate_script) (const char *outfile); }; diff --git a/trunk/tools/perf/util/ui/browsers/annotate.c b/trunk/tools/perf/util/ui/browsers/annotate.c index 295a9c93f945..0575905d1205 100644 --- a/trunk/tools/perf/util/ui/browsers/annotate.c +++ b/trunk/tools/perf/util/ui/browsers/annotate.c @@ -224,7 
+224,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) } static int annotate_browser__run(struct annotate_browser *self, int evidx, - void(*timer)(void *arg), + int nr_events, void(*timer)(void *arg), void *arg, int delay_secs) { struct rb_node *nd = NULL; @@ -328,7 +328,8 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx, notes = symbol__annotation(target); pthread_mutex_lock(¬es->lock); - if (notes->src == NULL && symbol__alloc_hist(target) < 0) { + if (notes->src == NULL && + symbol__alloc_hist(target, nr_events) < 0) { pthread_mutex_unlock(¬es->lock); ui__warning("Not enough memory for annotating '%s' symbol!\n", target->name); @@ -336,7 +337,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx, } pthread_mutex_unlock(¬es->lock); - symbol__tui_annotate(target, ms->map, evidx, + symbol__tui_annotate(target, ms->map, evidx, nr_events, timer, arg, delay_secs); } continue; @@ -357,15 +358,15 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx, return key; } -int hist_entry__tui_annotate(struct hist_entry *he, int evidx, +int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events, void(*timer)(void *arg), void *arg, int delay_secs) { - return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, + return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, nr_events, timer, arg, delay_secs); } int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, - void(*timer)(void *arg), void *arg, + int nr_events, void(*timer)(void *arg), void *arg, int delay_secs) { struct objdump_line *pos, *n; @@ -418,7 +419,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, browser.b.nr_entries = browser.nr_entries; browser.b.entries = ¬es->src->source, browser.b.width += 18; /* Percentage */ - ret = annotate_browser__run(&browser, evidx, timer, arg, delay_secs); + ret = annotate_browser__run(&browser, evidx, nr_events, + timer, arg, delay_secs); list_for_each_entry_safe(pos, n, ¬es->src->source, node) { list_del(&pos->node); objdump_line__free(pos); diff --git a/trunk/tools/perf/util/ui/browsers/hists.c b/trunk/tools/perf/util/ui/browsers/hists.c index 1212a386a033..d0c94b459685 100644 --- a/trunk/tools/perf/util/ui/browsers/hists.c +++ b/trunk/tools/perf/util/ui/browsers/hists.c @@ -1020,7 +1020,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, * Don't let this be freed, say, by hists__decay_entry. */ he->used = true; - err = hist_entry__tui_annotate(he, evsel->idx, + err = hist_entry__tui_annotate(he, evsel->idx, nr_events, timer, arg, delay_secs); he->used = false; ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries); diff --git a/trunk/tools/perf/util/ui/progress.c b/trunk/tools/perf/util/ui/progress.c index 13aa64e50e11..295e366b6311 100644 --- a/trunk/tools/perf/util/ui/progress.c +++ b/trunk/tools/perf/util/ui/progress.c @@ -14,9 +14,6 @@ void ui_progress__update(u64 curr, u64 total, const char *title) if (use_browser <= 0) return; - if (total == 0) - return; - ui__refresh_dimensions(true); pthread_mutex_lock(&ui__lock); y = SLtt_Screen_Rows / 2 - 2; diff --git a/trunk/tools/perf/util/usage.c b/trunk/tools/perf/util/usage.c index d76d1c0ff98f..e16bf9a707e8 100644 --- a/trunk/tools/perf/util/usage.c +++ b/trunk/tools/perf/util/usage.c @@ -1,8 +1,5 @@ /* - * usage.c - * - * Various reporting routines. - * Originally copied from GIT source. 
+ * GIT - The information manager from hell * * Copyright (C) Linus Torvalds, 2005 */ diff --git a/trunk/tools/perf/util/util.h b/trunk/tools/perf/util/util.h index 37be34dff798..0128906bac88 100644 --- a/trunk/tools/perf/util/util.h +++ b/trunk/tools/perf/util/util.h @@ -245,15 +245,4 @@ int readn(int fd, void *buf, size_t size); #define _STR(x) #x #define STR(x) _STR(x) -/* - * Determine whether some value is a power of two, where zero is - * *not* considered a power of two. - */ - -static inline __attribute__((const)) -bool is_power_of_2(unsigned long n) -{ - return (n != 0 && ((n & (n - 1)) == 0)); -} - #endif diff --git a/trunk/tools/perf/util/values.c b/trunk/tools/perf/util/values.c index 697c8b4e59cc..bdd33470b235 100644 --- a/trunk/tools/perf/util/values.c +++ b/trunk/tools/perf/util/values.c @@ -32,7 +32,6 @@ void perf_read_values_destroy(struct perf_read_values *values) for (i = 0; i < values->threads; i++) free(values->value[i]); - free(values->value); free(values->pid); free(values->tid); free(values->counterrawid); diff --git a/trunk/virt/kvm/assigned-dev.c b/trunk/virt/kvm/assigned-dev.c index 758e3b36d4cf..3ad0925d23a9 100644 --- a/trunk/virt/kvm/assigned-dev.c +++ b/trunk/virt/kvm/assigned-dev.c @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include "irq.h" static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, @@ -482,76 +480,12 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, return r; } -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. 
- */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = path.dentry->d_inode; - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { int r = 0, idx; struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; - u8 header_type; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; mutex_lock(&kvm->lock); idx = srcu_read_lock(&kvm->srcu); @@ -579,18 +513,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, r = -EINVAL; goto out_free; } - - /* Don't allow bridges to be assigned */ - pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type); - if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - if (pci_enable_device(dev)) { printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); r = -EBUSY; @@ -622,14 +544,16 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(&match->list, &kvm->arch.assigned_dev_head); - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); + if (r) + goto out_list_del; + } + r = kvm_assign_device(kvm, match); if (r) goto out_list_del; } - r = kvm_assign_device(kvm, match); - if (r) - goto out_list_del; out: srcu_read_unlock(&kvm->srcu, idx); @@ -669,7 +593,8 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, goto out; } - kvm_deassign_device(kvm, match); + if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) + kvm_deassign_device(kvm, match); kvm_free_assigned_device(kvm, match);