From 00b87c5d365e3c310a5a0e69c8980e6da01e5236 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 20 Dec 2012 07:21:54 -0800 Subject: [PATCH] --- yaml --- r: 347407 b: refs/heads/master c: 1ffab3d4139533eff6e27b7568825307e575faa6 h: refs/heads/master i: 347405: 7f62fa201c2b27500c2f9b0a8a8a5f424050a2a6 347403: a60ba2cfa5a421ac8013f637b4d6001a3045033c 347399: d3c4f11b5854fa4a037732752b80ea195e49ac9c 347391: 0b19f785dc15d8767cefa3337799cb74f2eba6c8 v: v3 --- [refs] | 2 +- trunk/.gitignore | 1 - trunk/Documentation/00-INDEX | 2 - trunk/Documentation/ABI/README | 3 - .../ABI/stable/sysfs-devices-node | 96 +- trunk/Documentation/ABI/testing/ima_policy | 3 +- trunk/Documentation/DocBook/kernel-api.tmpl | 3 + trunk/Documentation/aoe/aoe.txt | 4 +- .../Documentation/backlight/lp855x-driver.txt | 10 +- trunk/Documentation/cgroups/memory.txt | 66 +- .../cgroups/resource_counter.txt | 7 +- .../devicetree/bindings/arm/davinci/nand.txt | 8 + .../devicetree/bindings/i2c/i2c-cbus-gpio.txt | 27 + .../devicetree/bindings/i2c/i2c-mux-gpio.txt | 81 + .../devicetree/bindings/i2c/i2c-ocores.txt | 2 +- .../devicetree/bindings/i2c/i2c-s3c2410.txt | 20 +- .../bindings/input/gpio-matrix-keypad.txt | 46 + .../devicetree/bindings/input/pwm-beeper.txt | 7 + .../bindings/input/stmpe-keypad.txt | 39 + .../bindings/input/tca8418_keypad.txt | 8 + .../bindings/input/touchscreen/mms114.txt | 34 + .../bindings/input/touchscreen/stmpe.txt | 43 + .../devicetree/bindings/mtd/denali-nand.txt | 23 + .../devicetree/bindings/mtd/flctl-nand.txt | 49 + .../devicetree/bindings/mtd/fsmc-nand.txt | 12 +- .../devicetree/bindings/mtd/m25p80.txt | 29 + .../devicetree/bindings/mtd/mtd-physmap.txt | 3 + .../bindings/powerpc/fsl/raideng.txt | 81 + .../devicetree/bindings/pwm/pwm-tiecap.txt | 23 + .../devicetree/bindings/pwm/pwm-tiehrpwm.txt | 23 + .../devicetree/bindings/pwm/pwm-tipwmss.txt | 31 + .../devicetree/bindings/pwm/pwm.txt | 17 +- .../devicetree/bindings/pwm/spear-pwm.txt | 18 + .../devicetree/bindings/pwm/ti,twl-pwm.txt | 17 + .../devicetree/bindings/pwm/ti,twl-pwmled.txt | 17 + .../devicetree/bindings/pwm/vt8500-pwm.txt | 17 + .../devicetree/bindings/rtc/imxdi-rtc.txt | 17 + .../devicetree/bindings/rtc/rtc-omap.txt | 17 + .../bindings/spi/nvidia,tegra20-sflash.txt | 2 +- .../bindings/spi/nvidia,tegra20-slink.txt | 2 +- .../devicetree/bindings/spi/spi_atmel.txt | 26 + trunk/Documentation/filesystems/proc.txt | 130 +- trunk/Documentation/filesystems/vfat.txt | 9 + trunk/Documentation/hwmon/it87 | 10 + trunk/Documentation/kernel-parameters.txt | 7 +- trunk/Documentation/powerpc/ptrace.txt | 16 + trunk/Documentation/security/00-INDEX | 2 + trunk/Documentation/sparse.txt | 18 + trunk/Documentation/x86/boot.txt | 3 +- trunk/Documentation/xtensa/atomctl.txt | 44 + trunk/MAINTAINERS | 18 +- trunk/Makefile | 6 + trunk/arch/Kconfig | 20 + trunk/arch/alpha/include/asm/Kbuild | 9 - trunk/arch/alpha/include/asm/a.out.h | 89 +- trunk/arch/alpha/include/asm/compiler.h | 115 +- trunk/arch/alpha/include/asm/console.h | 48 +- trunk/arch/alpha/include/asm/fpu.h | 124 +- trunk/arch/alpha/include/asm/pal.h | 50 +- trunk/arch/alpha/include/asm/param.h | 20 +- trunk/arch/alpha/include/asm/ptrace.h | 68 +- trunk/arch/alpha/include/asm/signal.h | 135 +- trunk/arch/alpha/include/asm/socket.h | 79 +- trunk/arch/alpha/include/asm/termios.h | 68 +- trunk/arch/alpha/include/asm/types.h | 13 +- trunk/arch/alpha/include/asm/unistd.h | 469 +- trunk/arch/alpha/include/uapi/asm/Kbuild | 40 + trunk/arch/alpha/include/uapi/asm/a.out.h | 91 + .../alpha/include/{ 
=> uapi}/asm/auxvec.h | 0 .../include/{ => uapi}/asm/bitsperlong.h | 0 .../alpha/include/{ => uapi}/asm/byteorder.h | 0 trunk/arch/alpha/include/uapi/asm/compiler.h | 117 + trunk/arch/alpha/include/uapi/asm/console.h | 50 + .../arch/alpha/include/{ => uapi}/asm/errno.h | 0 .../arch/alpha/include/{ => uapi}/asm/fcntl.h | 0 trunk/arch/alpha/include/uapi/asm/fpu.h | 123 + .../alpha/include/{ => uapi}/asm/gentrap.h | 0 .../arch/alpha/include/{ => uapi}/asm/ioctl.h | 0 .../alpha/include/{ => uapi}/asm/ioctls.h | 0 .../alpha/include/{ => uapi}/asm/ipcbuf.h | 0 .../alpha/include/{ => uapi}/asm/kvm_para.h | 0 .../arch/alpha/include/{ => uapi}/asm/mman.h | 0 .../alpha/include/{ => uapi}/asm/msgbuf.h | 0 trunk/arch/alpha/include/uapi/asm/pal.h | 52 + trunk/arch/alpha/include/uapi/asm/param.h | 21 + .../arch/alpha/include/{ => uapi}/asm/poll.h | 0 .../include/{ => uapi}/asm/posix_types.h | 0 trunk/arch/alpha/include/uapi/asm/ptrace.h | 70 + trunk/arch/alpha/include/{ => uapi}/asm/reg.h | 0 .../alpha/include/{ => uapi}/asm/regdef.h | 0 .../alpha/include/{ => uapi}/asm/resource.h | 0 .../alpha/include/{ => uapi}/asm/sembuf.h | 0 .../arch/alpha/include/{ => uapi}/asm/setup.h | 0 .../alpha/include/{ => uapi}/asm/shmbuf.h | 0 .../alpha/include/{ => uapi}/asm/sigcontext.h | 0 .../alpha/include/{ => uapi}/asm/siginfo.h | 0 trunk/arch/alpha/include/uapi/asm/signal.h | 135 + trunk/arch/alpha/include/uapi/asm/socket.h | 81 + .../alpha/include/{ => uapi}/asm/sockios.h | 0 .../arch/alpha/include/{ => uapi}/asm/stat.h | 0 .../alpha/include/{ => uapi}/asm/statfs.h | 0 .../arch/alpha/include/{ => uapi}/asm/swab.h | 0 .../alpha/include/{ => uapi}/asm/sysinfo.h | 0 .../alpha/include/{ => uapi}/asm/termbits.h | 0 trunk/arch/alpha/include/uapi/asm/termios.h | 70 + trunk/arch/alpha/include/uapi/asm/types.h | 16 + trunk/arch/alpha/include/uapi/asm/unistd.h | 471 ++ trunk/arch/arm/boot/dts/imx28-cfa10049.dts | 24 + trunk/arch/arm/boot/dts/spear13xx.dtsi | 14 +- trunk/arch/arm/boot/dts/spear300.dtsi | 8 +- trunk/arch/arm/boot/dts/spear310.dtsi | 8 +- trunk/arch/arm/boot/dts/spear320.dtsi | 8 +- trunk/arch/arm/boot/dts/spear600.dtsi | 8 +- trunk/arch/arm/configs/nhk8815_defconfig | 2 +- trunk/arch/arm/include/uapi/asm/unistd.h | 1 + trunk/arch/arm/kernel/calls.S | 1 + trunk/arch/arm/mach-davinci/devices-da8xx.c | 13 +- trunk/arch/arm/mach-mxs/mach-mxs.c | 2 +- trunk/arch/arm/mach-nomadik/board-nhk8815.c | 71 +- .../arch/arm/mach-nomadik/include/mach/fsmc.h | 29 - trunk/arch/arm/mach-omap1/board-nokia770.c | 14 +- trunk/arch/arm/mach-omap2/board-n8x0.c | 42 + .../arm/mach-omap2/board-rx51-peripherals.c | 6 + trunk/arch/arm/mach-omap2/i2c.c | 19 + .../arm/mach-omap2/omap_hwmod_2430_data.c | 3 +- .../arm/mach-omap2/omap_hwmod_33xx_data.c | 3 +- .../arm/mach-omap2/omap_hwmod_3xxx_data.c | 12 +- .../arm/mach-omap2/omap_hwmod_44xx_data.c | 3 +- trunk/arch/arm/mach-u300/core.c | 14 +- .../arch/arm/mach-ux500/board-mop500-stuib.c | 71 +- trunk/arch/arm64/include/asm/unistd.h | 1 + trunk/arch/arm64/kernel/sys_compat.c | 15 - trunk/arch/blackfin/include/asm/Kbuild | 6 +- trunk/arch/blackfin/include/asm/bfin_sport.h | 128 +- trunk/arch/blackfin/include/asm/bfin_twi.h | 2 +- trunk/arch/blackfin/include/asm/fixed_code.h | 30 +- trunk/arch/blackfin/include/asm/kvm_para.h | 1 - trunk/arch/blackfin/include/asm/pgtable.h | 2 - trunk/arch/blackfin/include/asm/ptrace.h | 161 +- trunk/arch/blackfin/include/asm/uaccess.h | 41 +- trunk/arch/blackfin/include/asm/unistd.h | 430 +- trunk/arch/blackfin/include/mach-common/irq.h | 5 +- 
trunk/arch/blackfin/include/uapi/asm/Kbuild | 16 + .../blackfin/include/uapi/asm/bfin_sport.h | 136 + .../include/{ => uapi}/asm/byteorder.h | 0 .../include/{ => uapi}/asm/cachectl.h | 0 .../blackfin/include/{ => uapi}/asm/fcntl.h | 0 .../blackfin/include/uapi/asm/fixed_code.h | 38 + .../blackfin/include/{ => uapi}/asm/ioctls.h | 0 .../blackfin/include/{ => uapi}/asm/poll.h | 0 .../include/{ => uapi}/asm/posix_types.h | 0 trunk/arch/blackfin/include/uapi/asm/ptrace.h | 170 + .../include/{ => uapi}/asm/sigcontext.h | 0 .../blackfin/include/{ => uapi}/asm/siginfo.h | 0 .../blackfin/include/{ => uapi}/asm/signal.h | 0 .../blackfin/include/{ => uapi}/asm/stat.h | 0 .../blackfin/include/{ => uapi}/asm/swab.h | 0 trunk/arch/blackfin/include/uapi/asm/unistd.h | 437 ++ trunk/arch/blackfin/kernel/kgdb.c | 13 +- .../mach-bf518/include/mach/anomaly.h | 1 + .../mach-bf527/include/mach/anomaly.h | 1 + .../mach-bf533/include/mach/anomaly.h | 1 + .../mach-bf537/include/mach/anomaly.h | 1 + .../mach-bf538/include/mach/anomaly.h | 1 + .../mach-bf548/include/mach/anomaly.h | 1 + .../mach-bf561/include/mach/anomaly.h | 1 + .../blackfin/mach-bf609/include/mach/irq.h | 3 - trunk/arch/blackfin/mach-bf609/pm.c | 3 +- trunk/arch/blackfin/mach-common/dpmc.c | 19 +- .../arch/blackfin/mach-common/ints-priority.c | 272 +- trunk/arch/cris/include/asm/io.h | 39 +- trunk/arch/cris/kernel/module.c | 2 - trunk/arch/frv/kernel/setup.c | 12 +- trunk/arch/frv/mm/init.c | 2 +- trunk/arch/h8300/Kconfig | 1 + trunk/arch/openrisc/include/asm/Kbuild | 1 + .../arch/openrisc/include/uapi/asm/kvm_para.h | 1 - trunk/arch/openrisc/kernel/asm-offsets.c | 6 +- trunk/arch/parisc/kernel/module.c | 2 - trunk/arch/powerpc/Makefile | 2 +- trunk/arch/powerpc/boot/dts/a3m071.dts | 144 + .../powerpc/boot/dts/fsl/p5020si-post.dtsi | 1 + .../powerpc/boot/dts/fsl/p5020si-pre.dtsi | 6 + .../powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi | 85 + trunk/arch/powerpc/configs/pseries_defconfig | 2 +- trunk/arch/powerpc/include/asm/bitops.h | 75 +- trunk/arch/powerpc/include/asm/cputable.h | 12 +- trunk/arch/powerpc/include/asm/dbell.h | 2 +- .../arch/powerpc/include/asm/exception-64s.h | 97 +- trunk/arch/powerpc/include/asm/firmware.h | 4 +- trunk/arch/powerpc/include/asm/fsl_gtm.h | 2 +- trunk/arch/powerpc/include/asm/fsl_guts.h | 4 +- trunk/arch/powerpc/include/asm/hvcall.h | 23 +- trunk/arch/powerpc/include/asm/immap_qe.h | 2 +- trunk/arch/powerpc/include/asm/machdep.h | 39 +- trunk/arch/powerpc/include/asm/mmu.h | 1 + .../powerpc/include/asm/pSeries_reconfig.h | 47 - trunk/arch/powerpc/include/asm/ppc-opcode.h | 6 +- trunk/arch/powerpc/include/asm/prom.h | 16 + trunk/arch/powerpc/include/asm/qe.h | 2 +- trunk/arch/powerpc/include/asm/qe_ic.h | 2 +- trunk/arch/powerpc/include/asm/reg.h | 3 + trunk/arch/powerpc/include/asm/rtas.h | 5 + trunk/arch/powerpc/include/asm/setup.h | 29 + trunk/arch/powerpc/include/asm/systbl.h | 3 +- trunk/arch/powerpc/include/asm/ucc.h | 2 +- trunk/arch/powerpc/include/asm/ucc_fast.h | 2 +- trunk/arch/powerpc/include/asm/ucc_slow.h | 2 +- trunk/arch/powerpc/include/asm/udbg.h | 1 - trunk/arch/powerpc/include/asm/unistd.h | 3 +- trunk/arch/powerpc/include/uapi/asm/setup.h | 31 - trunk/arch/powerpc/include/uapi/asm/unistd.h | 1 + trunk/arch/powerpc/kernel/Makefile | 2 +- .../{cpu_setup_power7.S => cpu_setup_power.S} | 32 +- trunk/arch/powerpc/kernel/cputable.c | 38 + trunk/arch/powerpc/kernel/entry_64.S | 2 + trunk/arch/powerpc/kernel/exceptions-64s.S | 308 +- trunk/arch/powerpc/kernel/head_64.S | 6 +- 
trunk/arch/powerpc/kernel/idle.c | 3 - trunk/arch/powerpc/kernel/iommu.c | 16 +- trunk/arch/powerpc/kernel/machine_kexec.c | 14 +- trunk/arch/powerpc/kernel/machine_kexec_64.c | 8 +- trunk/arch/powerpc/kernel/pci_32.c | 2 +- trunk/arch/powerpc/kernel/prom.c | 7 +- trunk/arch/powerpc/kernel/prom_init.c | 11 +- trunk/arch/powerpc/kernel/ptrace.c | 90 +- trunk/arch/powerpc/kernel/rtas.c | 1 - trunk/arch/powerpc/kernel/rtas_flash.c | 4 +- trunk/arch/powerpc/kernel/setup_64.c | 5 + trunk/arch/powerpc/kernel/sys_ppc32.c | 17 +- trunk/arch/powerpc/kernel/udbg.c | 23 - trunk/arch/powerpc/mm/numa.c | 12 - trunk/arch/powerpc/mm/tlb_nohash_low.S | 15 +- trunk/arch/powerpc/perf/power7-pmu.c | 17 +- trunk/arch/powerpc/platforms/512x/Kconfig | 1 - .../arch/powerpc/platforms/512x/mpc5121_ads.c | 3 + trunk/arch/powerpc/platforms/512x/mpc512x.h | 11 +- .../powerpc/platforms/512x/mpc512x_shared.c | 25 +- trunk/arch/powerpc/platforms/52xx/lite5200.c | 2 +- .../powerpc/platforms/52xx/mpc5200_simple.c | 1 + .../powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 16 +- .../powerpc/platforms/82xx/pq2ads-pci-pic.c | 8 +- .../arch/powerpc/platforms/83xx/mpc832x_mds.c | 2 +- .../arch/powerpc/platforms/83xx/mpc836x_mds.c | 2 +- .../arch/powerpc/platforms/83xx/mpc836x_rdk.c | 2 +- .../arch/powerpc/platforms/83xx/mpc837x_rdb.c | 2 +- .../arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- trunk/arch/powerpc/platforms/85xx/p1022_ds.c | 8 +- trunk/arch/powerpc/platforms/85xx/smp.c | 49 +- .../powerpc/platforms/86xx/mpc8610_hpcd.c | 2 + .../arch/powerpc/platforms/cell/spufs/sched.c | 2 +- .../powerpc/platforms/powermac/cpufreq_32.c | 5 +- .../arch/powerpc/platforms/powernv/pci-ioda.c | 25 +- trunk/arch/powerpc/platforms/ps3/os-area.c | 6 +- trunk/arch/powerpc/platforms/pseries/dlpar.c | 34 +- .../arch/powerpc/platforms/pseries/firmware.c | 1 + .../powerpc/platforms/pseries/hotplug-cpu.c | 8 +- .../platforms/pseries/hotplug-memory.c | 60 +- trunk/arch/powerpc/platforms/pseries/iommu.c | 10 +- .../arch/powerpc/platforms/pseries/mobility.c | 4 +- .../platforms/pseries/plpar_wrappers.h | 31 + .../arch/powerpc/platforms/pseries/reconfig.c | 119 +- trunk/arch/powerpc/platforms/pseries/setup.c | 77 +- trunk/arch/powerpc/platforms/pseries/smp.c | 1 - trunk/arch/powerpc/sysdev/fsl_gtm.c | 2 +- trunk/arch/powerpc/sysdev/fsl_pci.c | 37 +- trunk/arch/powerpc/sysdev/pmi.c | 13 +- trunk/arch/powerpc/sysdev/qe_lib/qe.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/qe_ic.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/qe_ic.h | 2 +- trunk/arch/powerpc/sysdev/qe_lib/qe_io.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/ucc.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/ucc_fast.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/ucc_slow.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/usb.c | 2 +- trunk/arch/powerpc/xmon/Makefile | 2 +- trunk/arch/powerpc/xmon/nonstdio.c | 53 +- trunk/arch/powerpc/xmon/nonstdio.h | 6 - trunk/arch/powerpc/xmon/start.c | 34 - trunk/arch/powerpc/xmon/xmon.c | 26 +- trunk/arch/s390/include/asm/ccwdev.h | 4 +- trunk/arch/s390/include/asm/pci.h | 39 + trunk/arch/s390/include/asm/pci_debug.h | 36 + trunk/arch/s390/pci/Makefile | 2 +- trunk/arch/s390/pci/pci.c | 73 +- trunk/arch/s390/pci/pci_clp.c | 1 + trunk/arch/s390/pci/pci_debug.c | 193 + trunk/arch/s390/pci/pci_dma.c | 8 +- trunk/arch/s390/pci/pci_event.c | 2 + trunk/arch/sparc/crypto/aes_asm.S | 20 +- trunk/arch/sparc/crypto/aes_glue.c | 31 +- trunk/arch/sparc/crypto/camellia_glue.c | 3 + trunk/arch/sparc/crypto/des_asm.S | 1 + trunk/arch/sparc/crypto/des_glue.c | 6 + trunk/arch/sparc/include/asm/hugetlb.h 
| 10 +- trunk/arch/sparc/include/asm/pgtable_64.h | 8 +- trunk/arch/sparc/include/asm/unistd.h | 1 + trunk/arch/sparc/kernel/module.c | 4 - trunk/arch/sparc/kernel/sys_sparc32.c | 14 - trunk/arch/tile/include/asm/compat.h | 2 - trunk/arch/tile/include/asm/elf.h | 2 + trunk/arch/tile/include/asm/ptrace.h | 3 +- trunk/arch/tile/include/asm/unistd.h | 1 + trunk/arch/tile/include/uapi/asm/ptrace.h | 8 +- trunk/arch/tile/kernel/compat.c | 18 - trunk/arch/tile/kernel/module.c | 2 - trunk/arch/tile/kernel/pci.c | 4 +- trunk/arch/tile/kernel/pci_gx.c | 3 +- trunk/arch/tile/kernel/ptrace.c | 140 +- trunk/arch/um/drivers/mconsole_kern.c | 2 +- trunk/arch/unicore32/kernel/module.c | 3 - trunk/arch/x86/Makefile | 5 +- trunk/arch/x86/include/asm/paravirt.h | 2 - .../arch/x86/include/uapi/asm/hw_breakpoint.h | 1 + trunk/arch/x86/include/uapi/asm/msr-index.h | 37 + trunk/arch/x86/include/uapi/asm/setup.h | 1 + trunk/arch/x86/kernel/cpu/proc.c | 7 +- trunk/arch/x86/kernel/irqinit.c | 40 - trunk/arch/x86/kernel/traps.c | 6 - trunk/arch/x86/platform/iris/iris.c | 67 +- trunk/arch/x86/syscalls/syscall_32.tbl | 1 + trunk/arch/x86/syscalls/syscall_64.tbl | 1 + trunk/arch/x86/xen/enlighten.c | 7 +- trunk/arch/x86/xen/smp.c | 2 +- trunk/arch/xtensa/Kconfig | 21 + trunk/arch/xtensa/Kconfig.debug | 22 +- trunk/arch/xtensa/Makefile | 20 +- trunk/arch/xtensa/boot/Makefile | 25 +- trunk/arch/xtensa/boot/boot-elf/Makefile | 26 +- trunk/arch/xtensa/boot/boot-redboot/Makefile | 26 +- trunk/arch/xtensa/boot/boot-uboot/Makefile | 14 + trunk/arch/xtensa/boot/dts/lx60.dts | 11 + trunk/arch/xtensa/boot/dts/ml605.dts | 11 + .../xtensa/boot/dts/xtfpga-flash-16m.dtsi | 26 + .../arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi | 18 + trunk/arch/xtensa/boot/dts/xtfpga.dtsi | 56 + trunk/arch/xtensa/include/asm/atomic.h | 271 +- trunk/arch/xtensa/include/asm/barrier.h | 6 +- trunk/arch/xtensa/include/asm/bitops.h | 127 +- trunk/arch/xtensa/include/asm/bootparam.h | 20 +- trunk/arch/xtensa/include/asm/cacheasm.h | 1 - trunk/arch/xtensa/include/asm/cacheflush.h | 3 +- trunk/arch/xtensa/include/asm/checksum.h | 19 +- trunk/arch/xtensa/include/asm/cmpxchg.h | 74 +- trunk/arch/xtensa/include/asm/current.h | 2 +- trunk/arch/xtensa/include/asm/delay.h | 7 +- trunk/arch/xtensa/include/asm/dma-mapping.h | 6 +- trunk/arch/xtensa/include/asm/elf.h | 10 +- trunk/arch/xtensa/include/asm/highmem.h | 1 - .../arch/xtensa/include/asm/initialize_mmu.h | 55 + trunk/arch/xtensa/include/asm/mmu_context.h | 2 +- trunk/arch/xtensa/include/asm/nommu_context.h | 2 +- trunk/arch/xtensa/include/asm/page.h | 20 +- trunk/arch/xtensa/include/asm/pci-bridge.h | 2 +- trunk/arch/xtensa/include/asm/pci.h | 2 +- trunk/arch/xtensa/include/asm/pgalloc.h | 2 +- trunk/arch/xtensa/include/asm/pgtable.h | 8 +- trunk/arch/xtensa/include/asm/platform.h | 1 - trunk/arch/xtensa/include/asm/processor.h | 10 +- trunk/arch/xtensa/include/asm/prom.h | 6 + trunk/arch/xtensa/include/asm/ptrace.h | 4 +- trunk/arch/xtensa/include/asm/regs.h | 5 +- trunk/arch/xtensa/include/asm/spinlock.h | 188 +- trunk/arch/xtensa/include/asm/syscall.h | 11 +- trunk/arch/xtensa/include/asm/traps.h | 23 + trunk/arch/xtensa/include/asm/uaccess.h | 43 +- trunk/arch/xtensa/kernel/Makefile | 8 +- trunk/arch/xtensa/kernel/align.S | 4 +- trunk/arch/xtensa/kernel/asm-offsets.c | 5 +- trunk/arch/xtensa/kernel/coprocessor.S | 25 +- trunk/arch/xtensa/kernel/entry.S | 67 +- trunk/arch/xtensa/kernel/head.S | 21 +- trunk/arch/xtensa/kernel/irq.c | 132 +- trunk/arch/xtensa/kernel/module.c | 2 +- 
trunk/arch/xtensa/kernel/platform.c | 1 - trunk/arch/xtensa/kernel/process.c | 2 +- trunk/arch/xtensa/kernel/ptrace.c | 3 +- trunk/arch/xtensa/kernel/setup.c | 279 +- trunk/arch/xtensa/kernel/signal.c | 8 +- trunk/arch/xtensa/kernel/syscall.c | 1 - trunk/arch/xtensa/kernel/time.c | 7 +- trunk/arch/xtensa/kernel/traps.c | 18 +- trunk/arch/xtensa/kernel/vectors.S | 67 +- trunk/arch/xtensa/lib/checksum.S | 15 +- trunk/arch/xtensa/lib/memcopy.S | 6 +- trunk/arch/xtensa/lib/pci-auto.c | 9 +- trunk/arch/xtensa/lib/strncpy_user.S | 4 +- trunk/arch/xtensa/lib/strnlen_user.S | 1 - trunk/arch/xtensa/lib/usercopy.S | 1 - trunk/arch/xtensa/mm/cache.c | 27 +- trunk/arch/xtensa/mm/fault.c | 1 - trunk/arch/xtensa/mm/init.c | 16 +- trunk/arch/xtensa/mm/misc.S | 51 +- trunk/arch/xtensa/mm/mmu.c | 2 +- trunk/arch/xtensa/mm/tlb.c | 9 +- .../platforms/iss/include/platform/serial.h | 15 + .../platforms/iss/include/platform/simcall.h | 7 +- trunk/arch/xtensa/platforms/xtfpga/Makefile | 9 + .../xtfpga/include/platform/hardware.h | 69 + .../platforms/xtfpga/include/platform/lcd.h | 20 + .../xtfpga/include/platform/serial.h | 18 + trunk/arch/xtensa/platforms/xtfpga/lcd.c | 76 + trunk/arch/xtensa/platforms/xtfpga/setup.c | 301 ++ trunk/arch/xtensa/variants/s6000/gpio.c | 4 +- trunk/block/Kconfig | 1 + trunk/block/genhd.c | 8 +- trunk/block/partitions/efi.c | 7 +- trunk/block/partitions/msdos.c | 21 +- trunk/drivers/atm/solos-pci.c | 186 +- trunk/drivers/base/dma-buf.c | 2 + trunk/drivers/bcma/driver_chipcommon_pmu.c | 3 +- trunk/drivers/block/aoe/aoe.h | 57 +- trunk/drivers/block/aoe/aoeblk.c | 104 +- trunk/drivers/block/aoe/aoechr.c | 7 +- trunk/drivers/block/aoe/aoecmd.c | 715 ++- trunk/drivers/block/aoe/aoedev.c | 243 +- trunk/drivers/block/aoe/aoemain.c | 2 +- trunk/drivers/block/aoe/aoenet.c | 15 +- trunk/drivers/block/cciss.c | 21 +- trunk/drivers/block/drbd/Kconfig | 10 +- trunk/drivers/block/drbd/Makefile | 2 + trunk/drivers/block/drbd/drbd_actlog.c | 702 ++- trunk/drivers/block/drbd/drbd_bitmap.c | 249 +- trunk/drivers/block/drbd/drbd_int.h | 1365 +++--- trunk/drivers/block/drbd/drbd_interval.c | 207 + trunk/drivers/block/drbd/drbd_interval.h | 40 + trunk/drivers/block/drbd/drbd_main.c | 3861 +++++++--------- trunk/drivers/block/drbd/drbd_nl.c | 3332 ++++++++------ trunk/drivers/block/drbd/drbd_nla.c | 55 + trunk/drivers/block/drbd/drbd_nla.h | 8 + trunk/drivers/block/drbd/drbd_proc.c | 41 +- trunk/drivers/block/drbd/drbd_receiver.c | 3904 ++++++++++------- trunk/drivers/block/drbd/drbd_req.c | 1574 ++++--- trunk/drivers/block/drbd/drbd_req.h | 187 +- trunk/drivers/block/drbd/drbd_state.c | 1856 ++++++++ trunk/drivers/block/drbd/drbd_state.h | 161 + trunk/drivers/block/drbd/drbd_strings.c | 1 + trunk/drivers/block/drbd/drbd_worker.c | 1237 +++--- trunk/drivers/block/drbd/drbd_wrappers.h | 11 +- trunk/drivers/block/loop.c | 10 + trunk/drivers/block/xen-blkback/blkback.c | 301 +- trunk/drivers/block/xen-blkback/common.h | 16 + trunk/drivers/block/xen-blkback/xenbus.c | 23 +- trunk/drivers/block/xen-blkfront.c | 199 +- trunk/drivers/char/random.c | 40 +- trunk/drivers/clk/clk-nomadik.c | 1 + trunk/drivers/crypto/nx/nx-842.c | 20 +- trunk/drivers/crypto/nx/nx.c | 1 - trunk/drivers/dma/dmatest.c | 49 +- trunk/drivers/firmware/efivars.c | 1 - trunk/drivers/gpio/Kconfig | 1 + trunk/drivers/gpio/gpio-ich.c | 1 + trunk/drivers/gpio/gpio-mvebu.c | 17 - trunk/drivers/gpu/drm/ttm/ttm_bo.c | 2 +- trunk/drivers/hwmon/hwmon-vid.c | 10 + trunk/drivers/hwmon/hwmon.c | 26 +- trunk/drivers/hwmon/it87.c | 918 ++-- 
trunk/drivers/hwmon/twl4030-madc-hwmon.c | 2 +- trunk/drivers/hwmon/w83627ehf.c | 99 +- trunk/drivers/hwmon/w83627hf.c | 81 +- trunk/drivers/i2c/busses/Kconfig | 10 + trunk/drivers/i2c/busses/Makefile | 1 + trunk/drivers/i2c/busses/i2c-at91.c | 338 +- trunk/drivers/i2c/busses/i2c-cbus-gpio.c | 300 ++ trunk/drivers/i2c/busses/i2c-gpio.c | 6 +- trunk/drivers/i2c/busses/i2c-mxs.c | 2 +- trunk/drivers/i2c/busses/i2c-nomadik.c | 14 - trunk/drivers/i2c/busses/i2c-ocores.c | 164 +- trunk/drivers/i2c/busses/i2c-omap.c | 226 +- trunk/drivers/i2c/busses/i2c-rcar.c | 6 +- trunk/drivers/i2c/busses/i2c-s3c2410.c | 211 +- trunk/drivers/i2c/busses/i2c-sh_mobile.c | 150 +- trunk/drivers/i2c/muxes/i2c-mux-gpio.c | 145 +- trunk/drivers/infiniband/hw/ehca/hcp_if.c | 20 - trunk/drivers/input/gameport/emu10k1-gp.c | 6 +- trunk/drivers/input/gameport/fm801-gp.c | 6 +- trunk/drivers/input/input-mt.c | 2 +- trunk/drivers/input/input.c | 181 +- trunk/drivers/input/joystick/as5011.c | 29 +- trunk/drivers/input/joystick/maplecontrol.c | 6 +- trunk/drivers/input/joystick/walkera0701.c | 7 +- trunk/drivers/input/joystick/xpad.c | 33 +- trunk/drivers/input/keyboard/Kconfig | 2 +- trunk/drivers/input/keyboard/adp5520-keys.c | 6 +- trunk/drivers/input/keyboard/adp5588-keys.c | 18 +- trunk/drivers/input/keyboard/adp5589-keys.c | 21 +- trunk/drivers/input/keyboard/bf54x-keys.c | 6 +- .../drivers/input/keyboard/davinci_keyscan.c | 4 +- trunk/drivers/input/keyboard/ep93xx_keypad.c | 6 +- trunk/drivers/input/keyboard/gpio_keys.c | 90 +- .../drivers/input/keyboard/gpio_keys_polled.c | 26 +- trunk/drivers/input/keyboard/hilkbd.c | 10 +- trunk/drivers/input/keyboard/imx_keypad.c | 9 +- trunk/drivers/input/keyboard/jornada680_kbd.c | 6 +- trunk/drivers/input/keyboard/jornada720_kbd.c | 6 +- trunk/drivers/input/keyboard/lm8323.c | 6 +- trunk/drivers/input/keyboard/lm8333.c | 6 +- trunk/drivers/input/keyboard/locomokbd.c | 8 +- trunk/drivers/input/keyboard/lpc32xx-keys.c | 8 +- trunk/drivers/input/keyboard/matrix_keypad.c | 129 +- trunk/drivers/input/keyboard/max7359_keypad.c | 6 +- trunk/drivers/input/keyboard/mcs_touchkey.c | 6 +- .../drivers/input/keyboard/mpr121_touchkey.c | 12 +- .../input/keyboard/nomadik-ske-keypad.c | 38 +- trunk/drivers/input/keyboard/omap-keypad.c | 6 +- trunk/drivers/input/keyboard/omap4-keypad.c | 10 +- trunk/drivers/input/keyboard/opencores-kbd.c | 6 +- .../drivers/input/keyboard/pmic8xxx-keypad.c | 10 +- trunk/drivers/input/keyboard/pxa27x_keypad.c | 6 +- trunk/drivers/input/keyboard/pxa930_rotary.c | 6 +- trunk/drivers/input/keyboard/qt1070.c | 8 +- trunk/drivers/input/keyboard/qt2160.c | 31 +- trunk/drivers/input/keyboard/samsung-keypad.c | 109 +- trunk/drivers/input/keyboard/sh_keysc.c | 6 +- trunk/drivers/input/keyboard/spear-keyboard.c | 98 +- trunk/drivers/input/keyboard/stmpe-keypad.c | 142 +- trunk/drivers/input/keyboard/tc3589x-keypad.c | 6 +- trunk/drivers/input/keyboard/tca6416-keypad.c | 8 +- trunk/drivers/input/keyboard/tca8418_keypad.c | 179 +- trunk/drivers/input/keyboard/tegra-kbc.c | 16 +- .../drivers/input/keyboard/tnetv107x-keypad.c | 6 +- trunk/drivers/input/keyboard/twl4030_keypad.c | 8 +- trunk/drivers/input/keyboard/w90p910_keypad.c | 6 +- trunk/drivers/input/matrix-keymap.c | 23 +- trunk/drivers/input/misc/88pm80x_onkey.c | 6 +- trunk/drivers/input/misc/88pm860x_onkey.c | 6 +- trunk/drivers/input/misc/Kconfig | 29 +- trunk/drivers/input/misc/Makefile | 2 + trunk/drivers/input/misc/ab8500-ponkey.c | 6 +- trunk/drivers/input/misc/ad714x-i2c.c | 6 +- 
trunk/drivers/input/misc/ad714x-spi.c | 6 +- trunk/drivers/input/misc/adxl34x-i2c.c | 6 +- trunk/drivers/input/misc/adxl34x-spi.c | 6 +- trunk/drivers/input/misc/bfin_rotary.c | 6 +- trunk/drivers/input/misc/bma150.c | 28 +- trunk/drivers/input/misc/cma3000_d0x_i2c.c | 6 +- trunk/drivers/input/misc/cobalt_btns.c | 6 +- trunk/drivers/input/misc/da9052_onkey.c | 6 +- trunk/drivers/input/misc/da9055_onkey.c | 171 + trunk/drivers/input/misc/dm355evm_keys.c | 6 +- trunk/drivers/input/misc/gp2ap002a00f.c | 8 +- trunk/drivers/input/misc/gpio_tilt_polled.c | 6 +- trunk/drivers/input/misc/ixp4xx-beeper.c | 6 +- trunk/drivers/input/misc/kxtj9.c | 16 +- trunk/drivers/input/misc/m68kspkr.c | 6 +- trunk/drivers/input/misc/max8925_onkey.c | 6 +- trunk/drivers/input/misc/max8997_haptic.c | 6 +- trunk/drivers/input/misc/mc13783-pwrbutton.c | 6 +- trunk/drivers/input/misc/mma8450.c | 6 +- trunk/drivers/input/misc/mpu3050.c | 8 +- trunk/drivers/input/misc/pcap_keys.c | 6 +- trunk/drivers/input/misc/pcf50633-input.c | 6 +- trunk/drivers/input/misc/pcf8574_keypad.c | 6 +- trunk/drivers/input/misc/pcspkr.c | 6 +- trunk/drivers/input/misc/pm8xxx-vibrator.c | 6 +- trunk/drivers/input/misc/pmic8xxx-pwrkey.c | 6 +- trunk/drivers/input/misc/pwm-beeper.c | 20 +- trunk/drivers/input/misc/rb532_button.c | 6 +- trunk/drivers/input/misc/retu-pwrbutton.c | 99 + trunk/drivers/input/misc/rotary_encoder.c | 9 +- trunk/drivers/input/misc/sgi_btns.c | 6 +- trunk/drivers/input/misc/sparcspkr.c | 14 +- trunk/drivers/input/misc/twl4030-pwrbutton.c | 3 +- trunk/drivers/input/misc/twl4030-vibra.c | 6 +- trunk/drivers/input/misc/twl6040-vibra.c | 6 +- trunk/drivers/input/misc/wistron_btns.c | 20 +- trunk/drivers/input/misc/wm831x-on.c | 11 +- trunk/drivers/input/misc/xen-kbdfront.c | 2 +- trunk/drivers/input/mouse/alps.c | 10 +- trunk/drivers/input/mouse/gpio_mouse.c | 6 +- trunk/drivers/input/mouse/maplemouse.c | 6 +- trunk/drivers/input/mouse/navpoint.c | 6 +- trunk/drivers/input/mouse/pxa930_trkball.c | 6 +- trunk/drivers/input/mouse/synaptics_i2c.c | 6 +- trunk/drivers/input/serio/Kconfig | 9 + trunk/drivers/input/serio/Makefile | 1 + trunk/drivers/input/serio/altera_ps2.c | 6 +- trunk/drivers/input/serio/ambakmi.c | 6 +- trunk/drivers/input/serio/arc_ps2.c | 274 ++ trunk/drivers/input/serio/ct82c710.c | 6 +- trunk/drivers/input/serio/gscps2.c | 6 +- trunk/drivers/input/serio/hil_mlc.c | 13 +- trunk/drivers/input/serio/i8042-io.h | 2 +- trunk/drivers/input/serio/i8042-sparcio.h | 6 +- trunk/drivers/input/serio/i8042-x86ia64io.h | 9 + trunk/drivers/input/serio/i8042.c | 6 +- trunk/drivers/input/serio/maceps2.c | 8 +- trunk/drivers/input/serio/pcips2.c | 6 +- trunk/drivers/input/serio/q40kbd.c | 6 +- trunk/drivers/input/serio/rpckbd.c | 6 +- trunk/drivers/input/serio/sa1111ps2.c | 12 +- trunk/drivers/input/serio/serio.c | 11 - trunk/drivers/input/serio/xilinx_ps2.c | 8 +- trunk/drivers/input/tablet/wacom_sys.c | 58 +- trunk/drivers/input/tablet/wacom_wac.c | 32 +- trunk/drivers/input/tablet/wacom_wac.h | 2 + trunk/drivers/input/touchscreen/88pm860x-ts.c | 8 +- trunk/drivers/input/touchscreen/Kconfig | 12 - trunk/drivers/input/touchscreen/Makefile | 1 - trunk/drivers/input/touchscreen/ad7877.c | 6 +- trunk/drivers/input/touchscreen/ad7879-i2c.c | 6 +- trunk/drivers/input/touchscreen/ad7879-spi.c | 6 +- trunk/drivers/input/touchscreen/ads7846.c | 10 +- .../drivers/input/touchscreen/atmel_mxt_ts.c | 6 +- .../drivers/input/touchscreen/atmel_tsadcc.c | 6 +- .../drivers/input/touchscreen/auo-pixcir-ts.c | 8 +- 
trunk/drivers/input/touchscreen/bu21013_ts.c | 125 +- .../drivers/input/touchscreen/cy8ctmg110_ts.c | 19 +- trunk/drivers/input/touchscreen/cyttsp_i2c.c | 6 +- trunk/drivers/input/touchscreen/cyttsp_spi.c | 6 +- trunk/drivers/input/touchscreen/da9034-ts.c | 6 +- trunk/drivers/input/touchscreen/da9052_tsi.c | 10 +- trunk/drivers/input/touchscreen/edt-ft5x06.c | 28 +- trunk/drivers/input/touchscreen/eeti_ts.c | 6 +- trunk/drivers/input/touchscreen/egalax_ts.c | 8 +- .../input/touchscreen/h3600_ts_input.c | 479 -- trunk/drivers/input/touchscreen/htcpen.c | 6 +- trunk/drivers/input/touchscreen/ili210x.c | 6 +- .../input/touchscreen/intel-mid-touch.c | 14 +- .../drivers/input/touchscreen/jornada720_ts.c | 6 +- trunk/drivers/input/touchscreen/lpc32xx_ts.c | 6 +- trunk/drivers/input/touchscreen/max11801_ts.c | 8 +- trunk/drivers/input/touchscreen/mc13783_ts.c | 4 +- trunk/drivers/input/touchscreen/mcs5000_ts.c | 6 +- trunk/drivers/input/touchscreen/mms114.c | 68 +- trunk/drivers/input/touchscreen/pcap_ts.c | 6 +- .../drivers/input/touchscreen/pixcir_i2c_ts.c | 6 +- trunk/drivers/input/touchscreen/s3c2410_ts.c | 6 +- trunk/drivers/input/touchscreen/st1232.c | 8 +- trunk/drivers/input/touchscreen/stmpe-ts.c | 133 +- .../drivers/input/touchscreen/ti_am335x_tsc.c | 6 +- .../drivers/input/touchscreen/tnetv107x-ts.c | 6 +- trunk/drivers/input/touchscreen/tps6507x-ts.c | 4 +- trunk/drivers/input/touchscreen/tsc2005.c | 8 +- trunk/drivers/input/touchscreen/tsc2007.c | 6 +- trunk/drivers/input/touchscreen/ucb1400_ts.c | 8 +- trunk/drivers/input/touchscreen/w90p910_ts.c | 6 +- trunk/drivers/input/touchscreen/wacom_i2c.c | 6 +- trunk/drivers/input/touchscreen/wm831x-ts.c | 12 +- trunk/drivers/isdn/mISDN/dsp_core.c | 3 +- trunk/drivers/macintosh/smu.c | 2 +- .../drivers/macintosh/windfarm_fcu_controls.c | 14 +- .../drivers/macintosh/windfarm_lm75_sensor.c | 14 +- .../macintosh/windfarm_max6690_sensor.c | 13 +- trunk/drivers/macintosh/windfarm_smu_sat.c | 13 +- trunk/drivers/md/md.c | 258 +- trunk/drivers/md/md.h | 28 +- trunk/drivers/md/raid1.c | 15 +- trunk/drivers/md/raid10.c | 15 +- trunk/drivers/md/raid5.c | 55 +- trunk/drivers/message/fusion/mptscsih.c | 1 + trunk/drivers/mfd/stmpe.c | 2 + trunk/drivers/mtd/ar7part.c | 7 +- trunk/drivers/mtd/bcm63xxpart.c | 32 +- trunk/drivers/mtd/chips/cfi_cmdset_0002.c | 16 +- trunk/drivers/mtd/cmdlinepart.c | 91 +- trunk/drivers/mtd/devices/bcm47xxsflash.c | 4 +- trunk/drivers/mtd/devices/block2mtd.c | 4 +- trunk/drivers/mtd/devices/docg3.c | 2 +- trunk/drivers/mtd/devices/docprobe.c | 2 - trunk/drivers/mtd/devices/m25p80.c | 48 +- trunk/drivers/mtd/devices/mtd_dataflash.c | 14 +- trunk/drivers/mtd/devices/spear_smi.c | 23 +- trunk/drivers/mtd/devices/sst25l.c | 10 +- trunk/drivers/mtd/maps/Kconfig | 7 - trunk/drivers/mtd/maps/Makefile | 1 - trunk/drivers/mtd/maps/amd76xrom.c | 7 +- trunk/drivers/mtd/maps/autcpu12-nvram.c | 6 +- trunk/drivers/mtd/maps/bfin-async-flash.c | 9 +- trunk/drivers/mtd/maps/ck804xrom.c | 6 +- trunk/drivers/mtd/maps/esb2rom.c | 6 +- trunk/drivers/mtd/maps/fortunet.c | 277 -- trunk/drivers/mtd/maps/gpio-addr-flash.c | 12 +- trunk/drivers/mtd/maps/ichxrom.c | 8 +- trunk/drivers/mtd/maps/intel_vr_nor.c | 18 +- trunk/drivers/mtd/maps/lantiq-flash.c | 8 +- trunk/drivers/mtd/maps/latch-addr-flash.c | 4 +- trunk/drivers/mtd/maps/pci.c | 6 +- trunk/drivers/mtd/maps/physmap_of.c | 19 +- trunk/drivers/mtd/maps/pismo.c | 18 +- trunk/drivers/mtd/maps/pxa2xx-flash.c | 6 +- trunk/drivers/mtd/maps/sa1100-flash.c | 6 +- trunk/drivers/mtd/maps/scb2_flash.c | 
8 +- trunk/drivers/mtd/maps/sun_uflash.c | 6 +- trunk/drivers/mtd/maps/vmu-flash.c | 10 +- trunk/drivers/mtd/mtd_blkdevs.c | 51 +- trunk/drivers/mtd/mtdoops.c | 15 +- trunk/drivers/mtd/nand/Kconfig | 34 +- trunk/drivers/mtd/nand/Makefile | 4 +- trunk/drivers/mtd/nand/ams-delta.c | 6 +- trunk/drivers/mtd/nand/atmel_nand.c | 28 +- trunk/drivers/mtd/nand/au1550nd.c | 8 +- trunk/drivers/mtd/nand/bcm47xxnflash/Makefile | 4 + .../mtd/nand/bcm47xxnflash/bcm47xxnflash.h | 22 + trunk/drivers/mtd/nand/bcm47xxnflash/main.c | 108 + .../mtd/nand/bcm47xxnflash/ops_bcm4706.c | 413 ++ trunk/drivers/mtd/nand/bf5xx_nand.c | 8 +- trunk/drivers/mtd/nand/cafe_nand.c | 12 +- trunk/drivers/mtd/nand/cs553x_nand.c | 3 +- trunk/drivers/mtd/nand/davinci_nand.c | 13 +- trunk/drivers/mtd/nand/denali.c | 162 +- trunk/drivers/mtd/nand/denali.h | 5 + trunk/drivers/mtd/nand/denali_dt.c | 167 + trunk/drivers/mtd/nand/denali_pci.c | 144 + trunk/drivers/mtd/nand/diskonchip.c | 2 - trunk/drivers/mtd/nand/docg4.c | 73 +- trunk/drivers/mtd/nand/fsl_elbc_nand.c | 17 +- trunk/drivers/mtd/nand/fsl_ifc_nand.c | 6 +- trunk/drivers/mtd/nand/fsl_upm.c | 8 +- trunk/drivers/mtd/nand/fsmc_nand.c | 106 +- trunk/drivers/mtd/nand/gpio.c | 34 +- trunk/drivers/mtd/nand/gpmi-nand/gpmi-lib.c | 10 +- trunk/drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 41 +- trunk/drivers/mtd/nand/gpmi-nand/gpmi-nand.h | 1 - trunk/drivers/mtd/nand/jz4740_nand.c | 14 +- trunk/drivers/mtd/nand/lpc32xx_mlc.c | 6 +- trunk/drivers/mtd/nand/lpc32xx_slc.c | 6 +- trunk/drivers/mtd/nand/mpc5121_nfc.c | 8 +- trunk/drivers/mtd/nand/mxc_nand.c | 12 +- trunk/drivers/mtd/nand/nand_base.c | 114 +- trunk/drivers/mtd/nand/nandsim.c | 191 +- trunk/drivers/mtd/nand/ndfc.c | 6 +- trunk/drivers/mtd/nand/nomadik_nand.c | 235 - trunk/drivers/mtd/nand/nuc900_nand.c | 6 +- trunk/drivers/mtd/nand/omap2.c | 2 +- trunk/drivers/mtd/nand/orion_nand.c | 4 +- trunk/drivers/mtd/nand/pasemi_nand.c | 4 +- trunk/drivers/mtd/nand/plat_nand.c | 6 +- trunk/drivers/mtd/nand/s3c2410.c | 7 +- trunk/drivers/mtd/nand/sh_flctl.c | 306 +- trunk/drivers/mtd/nand/sharpsl.c | 6 +- trunk/drivers/mtd/nand/socrates_nand.c | 6 +- trunk/drivers/mtd/ofpart.c | 5 +- trunk/drivers/mtd/onenand/generic.c | 6 +- trunk/drivers/mtd/onenand/omap2.c | 6 +- trunk/drivers/mtd/onenand/samsung.c | 4 +- trunk/drivers/mtd/tests/mtd_nandbiterrs.c | 73 +- trunk/drivers/mtd/tests/mtd_nandecctest.c | 6 +- trunk/drivers/mtd/tests/mtd_oobtest.c | 171 +- trunk/drivers/mtd/tests/mtd_pagetest.c | 152 +- trunk/drivers/mtd/tests/mtd_readtest.c | 44 +- trunk/drivers/mtd/tests/mtd_speedtest.c | 88 +- trunk/drivers/mtd/tests/mtd_stresstest.c | 44 +- trunk/drivers/mtd/tests/mtd_subpagetest.c | 124 +- trunk/drivers/mtd/tests/mtd_torturetest.c | 73 +- trunk/drivers/net/bonding/bond_main.c | 2 - .../net/can/sja1000/sja1000_of_platform.c | 2 +- .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 5 +- trunk/drivers/net/ethernet/emulex/benet/be.h | 2 +- .../net/ethernet/emulex/benet/be_cmds.c | 5 + .../net/ethernet/emulex/benet/be_main.c | 59 +- trunk/drivers/net/ethernet/freescale/Kconfig | 3 +- .../drivers/net/ethernet/ibm/ehea/ehea_phyp.h | 20 - trunk/drivers/net/ethernet/micrel/ksz884x.c | 12 +- .../net/ethernet/qlogic/qlcnic/qlcnic.h | 4 +- .../net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 5 +- .../net/ethernet/qlogic/qlcnic/qlcnic_hw.c | 5 +- .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 5 - .../ethernet/qlogic/qlcnic/qlcnic_minidump.c | 3 +- trunk/drivers/net/ethernet/realtek/8139cp.c | 18 +- trunk/drivers/net/ethernet/smsc/smc91x.c | 4 +- 
trunk/drivers/net/ethernet/smsc/smsc911x.c | 4 +- .../net/ethernet/stmicro/stmmac/stmmac.h | 6 +- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 22 +- trunk/drivers/net/ethernet/ti/cpts.c | 2 - trunk/drivers/net/tun.c | 87 +- trunk/drivers/net/usb/cdc_ether.c | 45 +- trunk/drivers/net/usb/cdc_ncm.c | 10 +- trunk/drivers/net/usb/qmi_wwan.c | 15 + trunk/drivers/net/usb/usbnet.c | 25 +- trunk/drivers/net/wimax/i2400m/i2400m-usb.h | 3 + trunk/drivers/net/wimax/i2400m/usb.c | 6 + trunk/drivers/net/wireless/Kconfig | 6 +- trunk/drivers/net/wireless/Makefile | 2 +- trunk/drivers/net/wireless/rt2x00/rt2x00dev.c | 8 + trunk/drivers/of/base.c | 144 +- trunk/drivers/of/fdt.c | 10 +- trunk/drivers/platform/x86/asus-nb-wmi.c | 2 +- trunk/drivers/platform/x86/asus-wmi.c | 2 +- trunk/drivers/platform/x86/eeepc-wmi.c | 2 +- trunk/drivers/power/charger-manager.c | 38 +- trunk/drivers/pwm/Kconfig | 39 +- trunk/drivers/pwm/Makefile | 5 +- trunk/drivers/pwm/core.c | 29 + trunk/drivers/pwm/pwm-imx.c | 2 +- trunk/drivers/pwm/pwm-lpc32xx.c | 23 +- trunk/drivers/pwm/pwm-samsung.c | 1 + trunk/drivers/pwm/pwm-spear.c | 276 ++ trunk/drivers/pwm/pwm-tiecap.c | 48 +- trunk/drivers/pwm/pwm-tiehrpwm.c | 62 +- trunk/drivers/pwm/pwm-tipwmss.c | 139 + trunk/drivers/pwm/pwm-tipwmss.h | 39 + trunk/drivers/pwm/pwm-twl-led.c | 344 ++ trunk/drivers/pwm/pwm-twl.c | 359 ++ trunk/drivers/pwm/pwm-twl6030.c | 184 - trunk/drivers/pwm/pwm-vt8500.c | 98 +- trunk/drivers/rtc/Kconfig | 31 +- trunk/drivers/rtc/Makefile | 2 + trunk/drivers/rtc/rtc-da9055.c | 413 ++ trunk/drivers/rtc/rtc-davinci.c | 21 +- trunk/drivers/rtc/rtc-dev.c | 19 - trunk/drivers/rtc/rtc-imxdi.c | 11 + trunk/drivers/rtc/rtc-omap.c | 80 +- trunk/drivers/rtc/rtc-pcf8523.c | 326 ++ trunk/drivers/rtc/rtc-s3c.c | 52 +- trunk/drivers/rtc/rtc-spear.c | 91 +- trunk/drivers/rtc/rtc-test.c | 14 +- trunk/drivers/rtc/rtc-tps65910.c | 9 +- trunk/drivers/rtc/rtc-vt8500.c | 15 +- trunk/drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- trunk/drivers/spi/spi-atmel.c | 17 +- trunk/drivers/spi/spi-s3c64xx.c | 10 +- trunk/drivers/spi/spi-sh-hspi.c | 2 +- trunk/drivers/spi/spi.c | 5 +- trunk/drivers/staging/android/binder.c | 3 +- trunk/drivers/usb/musb/musb_core.c | 12 +- trunk/drivers/usb/musb/musb_io.h | 21 - trunk/drivers/usb/musb/tusb6010.c | 5 +- trunk/drivers/video/Kconfig | 8 +- trunk/drivers/video/backlight/88pm860x_bl.c | 18 +- trunk/drivers/video/backlight/atmel-pwm-bl.c | 7 +- trunk/drivers/video/backlight/backlight.c | 29 + trunk/drivers/video/backlight/corgi_lcd.c | 20 +- trunk/drivers/video/backlight/da903x_bl.c | 15 +- trunk/drivers/video/backlight/da9052_bl.c | 2 +- trunk/drivers/video/backlight/generic_bl.c | 4 +- trunk/drivers/video/backlight/hp680_bl.c | 4 +- trunk/drivers/video/backlight/ili9320.c | 14 +- trunk/drivers/video/backlight/ili9320.h | 2 +- trunk/drivers/video/backlight/jornada720_bl.c | 31 +- trunk/drivers/video/backlight/l4f00242t03.c | 3 +- trunk/drivers/video/backlight/lcd.c | 8 +- trunk/drivers/video/backlight/lm3630_bl.c | 2 +- trunk/drivers/video/backlight/lm3639_bl.c | 2 +- trunk/drivers/video/backlight/lms283gf05.c | 17 +- trunk/drivers/video/backlight/locomolcd.c | 38 +- trunk/drivers/video/backlight/lp855x_bl.c | 51 +- trunk/drivers/video/backlight/max8925_bl.c | 11 +- trunk/drivers/video/backlight/omap1_bl.c | 4 +- trunk/drivers/video/backlight/pandora_bl.c | 8 +- .../video/backlight/pcf50633-backlight.c | 8 +- trunk/drivers/video/backlight/platform_lcd.c | 2 +- trunk/drivers/video/backlight/s6e63m0.c | 2 +- trunk/drivers/video/backlight/tdo24m.c | 33 +- 
trunk/drivers/video/backlight/tosa_bl.c | 7 +- trunk/drivers/video/backlight/tosa_lcd.c | 24 +- trunk/drivers/video/backlight/vgg2432a4.c | 10 +- trunk/drivers/video/gxt4500.c | 15 +- trunk/drivers/virt/fsl_hypervisor.c | 3 - trunk/fs/attr.c | 11 +- trunk/fs/autofs4/autofs_i.h | 8 +- trunk/fs/autofs4/dev-ioctl.c | 4 +- trunk/fs/autofs4/inode.c | 24 +- trunk/fs/autofs4/waitq.c | 5 +- trunk/fs/bad_inode.c | 2 +- trunk/fs/binfmt_elf.c | 4 +- trunk/fs/binfmt_em86.c | 1 - trunk/fs/binfmt_misc.c | 6 - trunk/fs/binfmt_script.c | 4 +- trunk/fs/block_dev.c | 4 +- trunk/fs/btrfs/Makefile | 2 +- trunk/fs/btrfs/acl.c | 2 + trunk/fs/btrfs/backref.c | 16 +- trunk/fs/btrfs/btrfs_inode.h | 4 + trunk/fs/btrfs/check-integrity.c | 31 +- trunk/fs/btrfs/compression.c | 6 +- trunk/fs/btrfs/ctree.c | 241 +- trunk/fs/btrfs/ctree.h | 182 +- trunk/fs/btrfs/delayed-inode.c | 11 +- trunk/fs/btrfs/dev-replace.c | 856 ++++ trunk/fs/btrfs/dev-replace.h | 44 + trunk/fs/btrfs/dir-item.c | 59 + trunk/fs/btrfs/disk-io.c | 142 +- trunk/fs/btrfs/disk-io.h | 4 +- trunk/fs/btrfs/extent-tree.c | 227 +- trunk/fs/btrfs/extent_io.c | 37 +- trunk/fs/btrfs/extent_io.h | 4 +- trunk/fs/btrfs/extent_map.c | 24 +- trunk/fs/btrfs/extent_map.h | 2 + trunk/fs/btrfs/file-item.c | 21 +- trunk/fs/btrfs/file.c | 422 +- trunk/fs/btrfs/free-space-cache.c | 51 +- trunk/fs/btrfs/inode-map.c | 5 +- trunk/fs/btrfs/inode.c | 484 +- trunk/fs/btrfs/ioctl.c | 317 +- trunk/fs/btrfs/ioctl.h | 48 +- trunk/fs/btrfs/math.h | 44 + trunk/fs/btrfs/ordered-data.c | 90 +- trunk/fs/btrfs/ordered-data.h | 7 +- trunk/fs/btrfs/print-tree.c | 3 + trunk/fs/btrfs/reada.c | 31 +- trunk/fs/btrfs/relocation.c | 40 +- trunk/fs/btrfs/root-tree.c | 4 +- trunk/fs/btrfs/scrub.c | 1836 +++++--- trunk/fs/btrfs/send.c | 8 +- trunk/fs/btrfs/super.c | 48 +- trunk/fs/btrfs/transaction.c | 170 +- trunk/fs/btrfs/transaction.h | 2 +- trunk/fs/btrfs/tree-log.c | 477 +- trunk/fs/btrfs/volumes.c | 966 +++- trunk/fs/btrfs/volumes.h | 35 +- trunk/fs/btrfs/xattr.c | 13 +- trunk/fs/ceph/dir.c | 4 +- trunk/fs/ceph/export.c | 4 +- trunk/fs/ceph/file.c | 6 +- trunk/fs/cifs/cifsfs.c | 8 +- trunk/fs/configfs/dir.c | 4 +- trunk/fs/eventfd.c | 20 + trunk/fs/eventpoll.c | 28 + trunk/fs/exec.c | 19 +- trunk/fs/exofs/inode.c | 16 +- trunk/fs/exportfs/expfs.c | 19 +- trunk/fs/ext3/dir.c | 6 +- trunk/fs/ext4/dir.c | 6 +- trunk/fs/ext4/file.c | 22 +- trunk/fs/fat/fat.h | 3 +- trunk/fs/fat/inode.c | 55 +- trunk/fs/fat/misc.c | 9 +- trunk/fs/fuse/dev.c | 4 +- trunk/fs/fuse/dir.c | 20 +- trunk/fs/fuse/file.c | 8 +- trunk/fs/fuse/fuse_i.h | 4 +- trunk/fs/fuse/inode.c | 23 +- trunk/fs/gfs2/file.c | 10 +- trunk/fs/hppfs/hppfs.c | 2 +- trunk/fs/jffs2/nodemgmt.c | 6 +- trunk/fs/libfs.c | 4 +- trunk/fs/lockd/clnt4xdr.c | 8 - trunk/fs/lockd/clntproc.c | 3 +- trunk/fs/lockd/clntxdr.c | 8 - trunk/fs/lockd/host.c | 15 +- trunk/fs/lockd/mon.c | 3 - trunk/fs/mount.h | 3 + trunk/fs/namespace.c | 212 +- trunk/fs/nfs/Makefile | 2 +- trunk/fs/nfs/blocklayout/blocklayout.c | 1 + trunk/fs/nfs/cache_lib.c | 1 - trunk/fs/nfs/callback.h | 4 +- trunk/fs/nfs/callback_proc.c | 17 +- trunk/fs/nfs/callback_xdr.c | 5 +- trunk/fs/nfs/client.c | 9 +- trunk/fs/nfs/dir.c | 28 +- trunk/fs/nfs/direct.c | 17 +- trunk/fs/nfs/file.c | 10 +- trunk/fs/nfs/inode.c | 10 +- trunk/fs/nfs/internal.h | 42 +- trunk/fs/nfs/mount_clnt.c | 7 +- trunk/fs/nfs/nfs2xdr.c | 4 +- trunk/fs/nfs/nfs3proc.c | 6 +- trunk/fs/nfs/nfs3xdr.c | 7 +- trunk/fs/nfs/nfs4_fs.h | 29 +- trunk/fs/nfs/nfs4client.c | 5 +- trunk/fs/nfs/nfs4file.c | 1 - trunk/fs/nfs/nfs4filelayout.c | 
45 +- trunk/fs/nfs/nfs4filelayoutdev.c | 3 +- trunk/fs/nfs/nfs4proc.c | 820 +--- trunk/fs/nfs/nfs4session.c | 552 +++ trunk/fs/nfs/nfs4session.h | 142 + trunk/fs/nfs/nfs4state.c | 143 +- trunk/fs/nfs/nfs4super.c | 1 + trunk/fs/nfs/nfs4xdr.c | 52 +- trunk/fs/nfs/objlayout/objlayout.c | 11 - trunk/fs/nfs/pnfs.c | 17 +- trunk/fs/nfs/proc.c | 43 - trunk/fs/nfs/super.c | 2 + trunk/fs/nfs/write.c | 31 +- trunk/fs/notify/Makefile | 2 +- trunk/fs/notify/fanotify/fanotify_user.c | 2 + trunk/fs/notify/fdinfo.c | 179 + trunk/fs/notify/fdinfo.h | 27 + trunk/fs/notify/inode_mark.c | 5 +- trunk/fs/notify/inotify/inotify_user.c | 2 + trunk/fs/ocfs2/extent_map.c | 12 +- trunk/fs/ocfs2/file.c | 6 +- trunk/fs/open.c | 2 +- trunk/fs/pnode.h | 1 + trunk/fs/proc/Makefile | 1 + trunk/fs/proc/array.c | 23 +- trunk/fs/proc/base.c | 169 +- trunk/fs/proc/fd.c | 2 + trunk/fs/proc/generic.c | 26 +- trunk/fs/proc/inode.c | 6 +- trunk/fs/proc/internal.h | 1 + trunk/fs/proc/namespaces.c | 185 +- trunk/fs/proc/proc_devtree.c | 6 +- trunk/fs/proc/root.c | 17 +- trunk/fs/proc/self.c | 59 + trunk/fs/proc/task_mmu.c | 53 + trunk/fs/pstore/inode.c | 6 +- trunk/fs/read_write.c | 40 +- trunk/fs/seq_file.c | 4 +- trunk/fs/signalfd.c | 18 + trunk/fs/sysfs/mount.c | 1 + trunk/fs/ubifs/debug.c | 8 +- trunk/fs/ubifs/dir.c | 4 +- trunk/include/asm-generic/io.h | 12 +- trunk/include/linux/asn1.h | 2 + trunk/include/linux/backing-dev.h | 4 - trunk/include/linux/backlight.h | 10 + trunk/include/linux/bcma/bcma.h | 1 + trunk/include/linux/binfmts.h | 2 - trunk/include/linux/blkdev.h | 19 +- trunk/include/linux/compat.h | 3 + trunk/include/linux/compiler-gcc4.h | 12 + trunk/include/linux/compiler-intel.h | 7 + trunk/include/linux/compiler.h | 11 + trunk/include/linux/cred.h | 2 - trunk/include/linux/dma-buf.h | 99 - trunk/include/linux/drbd.h | 81 +- trunk/include/linux/drbd_genl.h | 378 ++ trunk/include/linux/drbd_genl_api.h | 55 + trunk/include/linux/drbd_limits.h | 90 +- trunk/include/linux/drbd_nl.h | 163 - trunk/include/linux/drbd_tag_magic.h | 84 - trunk/include/linux/exportfs.h | 2 + trunk/include/linux/fs.h | 18 +- trunk/include/linux/ftrace.h | 4 +- trunk/include/linux/genhd.h | 8 +- trunk/include/linux/genl_magic_func.h | 422 ++ trunk/include/linux/genl_magic_struct.h | 277 ++ trunk/include/linux/gfp.h | 5 + trunk/include/linux/hugetlb_cgroup.h | 5 +- trunk/include/linux/i2c-omap.h | 2 - trunk/include/linux/i2c/i2c-sh_mobile.h | 1 + trunk/include/linux/idr.h | 11 + trunk/include/linux/ima.h | 6 + trunk/include/linux/init.h | 40 +- trunk/include/linux/input.h | 10 +- trunk/include/linux/input/bu21013.h | 10 +- trunk/include/linux/ipc_namespace.h | 9 +- trunk/include/linux/kernel.h | 33 + trunk/include/linux/loop.h | 3 + trunk/include/linux/lru_cache.h | 67 +- trunk/include/linux/memcontrol.h | 209 + trunk/include/linux/mm_types.h | 7 +- trunk/include/linux/mnt_namespace.h | 3 +- trunk/include/linux/moduleparam.h | 6 +- trunk/include/linux/mtd/blktrans.h | 4 +- trunk/include/linux/mtd/doc2000.h | 22 +- trunk/include/linux/mtd/fsmc.h | 3 - trunk/include/linux/mtd/gpmi-nand.h | 68 - trunk/include/linux/mtd/map.h | 4 +- trunk/include/linux/mtd/mtd.h | 2 +- trunk/include/linux/mtd/nand.h | 11 +- trunk/include/linux/mtd/sh_flctl.h | 14 +- trunk/include/linux/nfs_fs_sb.h | 47 - trunk/include/linux/nfs_xdr.h | 155 +- trunk/include/linux/nsproxy.h | 2 +- trunk/include/linux/of.h | 29 +- trunk/include/linux/of_platform.h | 1 + trunk/include/linux/percpu-rwsem.h | 91 +- trunk/include/linux/pid_namespace.h | 11 +- 
.../linux/platform_data/i2c-cbus-gpio.h | 27 + trunk/include/linux/platform_data/lp855x.h | 9 +- .../linux/platform_data/mtd-nomadik-nand.h | 16 - trunk/include/linux/proc_fs.h | 29 +- trunk/include/linux/ptrace.h | 2 + trunk/include/linux/pwm.h | 3 + trunk/include/linux/raid/pq.h | 4 + trunk/include/linux/random.h | 19 +- trunk/include/linux/res_counter.h | 12 +- trunk/include/linux/sched.h | 7 +- trunk/include/linux/security.h | 13 + trunk/include/linux/slab.h | 57 +- trunk/include/linux/slab_def.h | 9 +- trunk/include/linux/slub_def.h | 9 +- trunk/include/linux/string.h | 11 + trunk/include/linux/sunrpc/sched.h | 1 - trunk/include/linux/syscalls.h | 5 +- trunk/include/linux/thread_info.h | 2 + trunk/include/linux/usb/usbnet.h | 3 + trunk/include/linux/user_namespace.h | 10 + trunk/include/linux/utsname.h | 7 +- trunk/include/linux/wait.h | 164 + trunk/include/net/inet_connection_sock.h | 1 + trunk/include/net/ndisc.h | 7 + trunk/include/net/net_namespace.h | 2 + trunk/include/trace/events/btrfs.h | 3 +- trunk/include/trace/events/gfpflags.h | 1 + trunk/include/uapi/asm-generic/unistd.h | 4 +- trunk/include/uapi/linux/if_bridge.h | 3 + trunk/include/uapi/linux/module.h | 8 + trunk/include/uapi/linux/ptrace.h | 5 +- trunk/include/uapi/linux/swab.h | 12 +- trunk/include/xen/interface/event_channel.h | 13 + trunk/init/Kconfig | 4 +- trunk/init/do_mounts.c | 61 +- trunk/init/main.c | 1 - trunk/init/version.c | 2 + trunk/ipc/msgutil.c | 2 + trunk/ipc/namespace.c | 33 +- trunk/kernel/Makefile | 10 +- trunk/kernel/cgroup.c | 3 +- trunk/kernel/compat.c | 17 + trunk/kernel/cred.c | 27 +- trunk/kernel/events/core.c | 2 +- trunk/kernel/exit.c | 12 - trunk/kernel/fork.c | 73 +- trunk/kernel/irq/manage.c | 2 +- trunk/kernel/modsign_certificate.S | 19 + trunk/kernel/modsign_pubkey.c | 6 - trunk/kernel/module.c | 444 +- trunk/kernel/nsproxy.c | 36 +- trunk/kernel/pid.c | 62 +- trunk/kernel/pid_namespace.c | 113 +- trunk/kernel/posix-cpu-timers.c | 3 + trunk/kernel/printk.c | 40 +- trunk/kernel/ptrace.c | 13 +- trunk/kernel/res_counter.c | 20 +- trunk/kernel/sched/core.c | 10 +- trunk/kernel/sched/fair.c | 5 +- trunk/kernel/signal.c | 2 +- trunk/kernel/sys_ni.c | 1 + trunk/kernel/sysctl_binary.c | 2 +- trunk/kernel/trace/ftrace.c | 4 +- trunk/kernel/trace/trace.c | 60 +- trunk/kernel/trace/trace_stack.c | 4 - trunk/kernel/trace/trace_uprobe.c | 8 +- trunk/kernel/user.c | 2 + trunk/kernel/user_namespace.c | 147 +- trunk/kernel/utsname.c | 34 +- trunk/kernel/watchdog.c | 24 +- trunk/lib/Kconfig | 3 + trunk/lib/Kconfig.debug | 10 +- trunk/lib/Makefile | 5 +- trunk/lib/asn1_decoder.c | 8 +- trunk/lib/dynamic_debug.c | 9 +- trunk/lib/interval_tree_test_main.c | 7 +- trunk/lib/kstrtox.c | 64 + trunk/lib/lru_cache.c | 359 +- ....c => of-reconfig-notifier-error-inject.c} | 22 +- trunk/lib/percpu-rwsem.c | 165 + trunk/lib/raid6/Makefile | 9 +- trunk/lib/raid6/algos.c | 12 + trunk/lib/raid6/altivec.uc | 3 - trunk/lib/raid6/avx2.c | 251 ++ trunk/lib/raid6/mmx.c | 2 +- trunk/lib/raid6/recov_avx2.c | 323 ++ trunk/lib/raid6/recov_ssse3.c | 4 - trunk/lib/raid6/sse1.c | 2 +- trunk/lib/raid6/sse2.c | 8 +- trunk/lib/raid6/test/Makefile | 29 +- trunk/lib/raid6/x86.h | 14 +- trunk/lib/random32.c | 97 +- trunk/lib/rbtree_test.c | 8 +- trunk/lib/scatterlist.c | 3 +- trunk/lib/vsprintf.c | 109 +- trunk/mm/Kconfig | 13 +- trunk/mm/backing-dev.c | 84 - trunk/mm/hugetlb.c | 11 +- trunk/mm/hugetlb_cgroup.c | 19 +- trunk/mm/kmemleak.c | 3 +- trunk/mm/ksm.c | 16 +- trunk/mm/memcontrol.c | 1524 ++++++- trunk/mm/memory.c | 9 +- 
trunk/mm/memory_hotplug.c | 18 +- trunk/mm/migrate.c | 2 +- trunk/mm/mprotect.c | 30 +- trunk/mm/page_alloc.c | 38 +- trunk/mm/shmem.c | 20 +- trunk/mm/slab.c | 383 +- trunk/mm/slab.h | 190 +- trunk/mm/slab_common.c | 292 +- trunk/mm/slob.c | 48 +- trunk/mm/slub.c | 447 +- trunk/mm/vmscan.c | 26 +- trunk/net/atm/atm_sysfs.c | 40 +- trunk/net/bridge/br_mdb.c | 22 +- trunk/net/bridge/br_multicast.c | 13 +- trunk/net/bridge/br_netlink.c | 1 - trunk/net/bridge/br_private.h | 5 +- trunk/net/core/net_namespace.c | 32 +- trunk/net/dccp/ipv4.c | 4 +- trunk/net/dccp/ipv6.c | 3 +- trunk/net/ipv4/inet_connection_sock.c | 16 + trunk/net/ipv4/tcp_ipv4.c | 6 +- trunk/net/ipv6/Makefile | 2 +- trunk/net/ipv6/addrconf.c | 3 +- trunk/net/ipv6/ndisc.c | 17 + trunk/net/ipv6/tcp_ipv6.c | 3 +- trunk/net/mac802154/ieee802154_dev.c | 4 +- trunk/net/netlink/af_netlink.c | 5 +- trunk/net/sctp/Kconfig | 27 +- trunk/net/sctp/probe.c | 3 +- trunk/net/sctp/protocol.c | 4 +- trunk/net/sunrpc/auth_gss/auth_gss.c | 17 +- trunk/net/sunrpc/backchannel_rqst.c | 9 +- trunk/net/sunrpc/bc_svc.c | 2 +- trunk/net/sunrpc/cache.c | 4 +- trunk/net/sunrpc/clnt.c | 48 +- trunk/net/sunrpc/rpc_pipe.c | 9 +- trunk/net/sunrpc/rpcb_clnt.c | 5 +- trunk/net/sunrpc/sched.c | 71 +- trunk/net/sunrpc/svc.c | 12 +- trunk/net/sunrpc/svc_xprt.c | 11 +- trunk/net/sunrpc/svcsock.c | 6 +- trunk/net/sunrpc/xdr.c | 5 +- trunk/net/sunrpc/xprtsock.c | 19 +- trunk/scripts/Makefile.modsign | 32 + trunk/scripts/checkpatch.pl | 143 +- .../scripts/coccinelle/api/d_find_alias.cocci | 80 + trunk/security/capability.c | 6 + trunk/security/commoncap.c | 25 +- trunk/security/integrity/ima/ima.h | 2 +- trunk/security/integrity/ima/ima_api.c | 4 +- trunk/security/integrity/ima/ima_main.c | 21 + trunk/security/integrity/ima/ima_policy.c | 3 + trunk/security/security.c | 10 + trunk/security/selinux/nlmsgtab.c | 2 + trunk/security/yama/yama_lsm.c | 12 +- trunk/sound/Kconfig | 3 - trunk/sound/sound_core.c | 3 +- trunk/tools/power/x86/turbostat/Makefile | 21 +- trunk/tools/power/x86/turbostat/turbostat.8 | 103 +- trunk/tools/power/x86/turbostat/turbostat.c | 677 ++- .../power/x86/x86_energy_perf_policy/Makefile | 6 +- .../x86_energy_perf_policy.c | 2 +- .../testing/selftests/breakpoints/Makefile | 2 +- .../testing/selftests/cpu-hotplug/Makefile | 2 +- trunk/tools/testing/selftests/kcmp/Makefile | 6 +- .../tools/testing/selftests/kcmp/kcmp_test.c | 6 +- .../testing/selftests/memory-hotplug/Makefile | 2 +- trunk/tools/testing/selftests/mqueue/Makefile | 4 +- trunk/tools/testing/selftests/vm/Makefile | 2 +- 1224 files changed, 44953 insertions(+), 23367 deletions(-) create mode 100644 trunk/Documentation/devicetree/bindings/i2c/i2c-cbus-gpio.txt create mode 100644 trunk/Documentation/devicetree/bindings/i2c/i2c-mux-gpio.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/gpio-matrix-keypad.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/pwm-beeper.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/stmpe-keypad.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/tca8418_keypad.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/touchscreen/mms114.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/touchscreen/stmpe.txt create mode 100644 trunk/Documentation/devicetree/bindings/mtd/denali-nand.txt create mode 100644 trunk/Documentation/devicetree/bindings/mtd/flctl-nand.txt create mode 100644 trunk/Documentation/devicetree/bindings/mtd/m25p80.txt create mode 100644 
trunk/Documentation/devicetree/bindings/powerpc/fsl/raideng.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/pwm-tiehrpwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/pwm-tipwmss.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/spear-pwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/vt8500-pwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt create mode 100644 trunk/Documentation/devicetree/bindings/rtc/rtc-omap.txt create mode 100644 trunk/Documentation/devicetree/bindings/spi/spi_atmel.txt create mode 100644 trunk/Documentation/xtensa/atomctl.txt create mode 100644 trunk/arch/alpha/include/uapi/asm/a.out.h rename trunk/arch/alpha/include/{ => uapi}/asm/auxvec.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/bitsperlong.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/byteorder.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/compiler.h create mode 100644 trunk/arch/alpha/include/uapi/asm/console.h rename trunk/arch/alpha/include/{ => uapi}/asm/errno.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/fcntl.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/fpu.h rename trunk/arch/alpha/include/{ => uapi}/asm/gentrap.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/ioctl.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/ioctls.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/ipcbuf.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/kvm_para.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/mman.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/msgbuf.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/pal.h create mode 100644 trunk/arch/alpha/include/uapi/asm/param.h rename trunk/arch/alpha/include/{ => uapi}/asm/poll.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/posix_types.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/ptrace.h rename trunk/arch/alpha/include/{ => uapi}/asm/reg.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/regdef.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/resource.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/sembuf.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/setup.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/shmbuf.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/sigcontext.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/siginfo.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/signal.h create mode 100644 trunk/arch/alpha/include/uapi/asm/socket.h rename trunk/arch/alpha/include/{ => uapi}/asm/sockios.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/stat.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/statfs.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/swab.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/sysinfo.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/termbits.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/termios.h create mode 100644 trunk/arch/alpha/include/uapi/asm/types.h create mode 100644 trunk/arch/alpha/include/uapi/asm/unistd.h delete mode 100644 trunk/arch/arm/mach-nomadik/include/mach/fsmc.h delete mode 100644 trunk/arch/blackfin/include/asm/kvm_para.h create 
mode 100644 trunk/arch/blackfin/include/uapi/asm/bfin_sport.h rename trunk/arch/blackfin/include/{ => uapi}/asm/byteorder.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/cachectl.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/fcntl.h (100%) create mode 100644 trunk/arch/blackfin/include/uapi/asm/fixed_code.h rename trunk/arch/blackfin/include/{ => uapi}/asm/ioctls.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/poll.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/posix_types.h (100%) create mode 100644 trunk/arch/blackfin/include/uapi/asm/ptrace.h rename trunk/arch/blackfin/include/{ => uapi}/asm/sigcontext.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/siginfo.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/signal.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/stat.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/swab.h (100%) create mode 100644 trunk/arch/blackfin/include/uapi/asm/unistd.h delete mode 100644 trunk/arch/openrisc/include/uapi/asm/kvm_para.h create mode 100644 trunk/arch/powerpc/boot/dts/a3m071.dts create mode 100644 trunk/arch/powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi delete mode 100644 trunk/arch/powerpc/include/asm/pSeries_reconfig.h create mode 100644 trunk/arch/powerpc/include/asm/setup.h rename trunk/arch/powerpc/kernel/{cpu_setup_power7.S => cpu_setup_power.S} (80%) delete mode 100644 trunk/arch/powerpc/xmon/start.c create mode 100644 trunk/arch/s390/include/asm/pci_debug.h create mode 100644 trunk/arch/s390/pci/pci_debug.c create mode 100644 trunk/arch/xtensa/boot/boot-uboot/Makefile create mode 100644 trunk/arch/xtensa/boot/dts/lx60.dts create mode 100644 trunk/arch/xtensa/boot/dts/ml605.dts create mode 100644 trunk/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi create mode 100644 trunk/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi create mode 100644 trunk/arch/xtensa/boot/dts/xtfpga.dtsi create mode 100644 trunk/arch/xtensa/include/asm/initialize_mmu.h create mode 100644 trunk/arch/xtensa/include/asm/prom.h create mode 100644 trunk/arch/xtensa/include/asm/traps.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/Makefile create mode 100644 trunk/arch/xtensa/platforms/xtfpga/include/platform/hardware.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/include/platform/lcd.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/include/platform/serial.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/lcd.c create mode 100644 trunk/arch/xtensa/platforms/xtfpga/setup.c create mode 100644 trunk/drivers/block/drbd/drbd_interval.c create mode 100644 trunk/drivers/block/drbd/drbd_interval.h create mode 100644 trunk/drivers/block/drbd/drbd_nla.c create mode 100644 trunk/drivers/block/drbd/drbd_nla.h create mode 100644 trunk/drivers/block/drbd/drbd_state.c create mode 100644 trunk/drivers/block/drbd/drbd_state.h create mode 100644 trunk/drivers/i2c/busses/i2c-cbus-gpio.c create mode 100644 trunk/drivers/input/misc/da9055_onkey.c create mode 100644 trunk/drivers/input/misc/retu-pwrbutton.c create mode 100644 trunk/drivers/input/serio/arc_ps2.c delete mode 100644 trunk/drivers/input/touchscreen/h3600_ts_input.c delete mode 100644 trunk/drivers/mtd/maps/fortunet.c create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/Makefile create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/main.c create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c create mode 100644 trunk/drivers/mtd/nand/denali_dt.c 
create mode 100644 trunk/drivers/mtd/nand/denali_pci.c delete mode 100644 trunk/drivers/mtd/nand/nomadik_nand.c create mode 100644 trunk/drivers/pwm/pwm-spear.c create mode 100644 trunk/drivers/pwm/pwm-tipwmss.c create mode 100644 trunk/drivers/pwm/pwm-tipwmss.h create mode 100644 trunk/drivers/pwm/pwm-twl-led.c create mode 100644 trunk/drivers/pwm/pwm-twl.c delete mode 100644 trunk/drivers/pwm/pwm-twl6030.c create mode 100644 trunk/drivers/rtc/rtc-da9055.c create mode 100644 trunk/drivers/rtc/rtc-pcf8523.c create mode 100644 trunk/fs/btrfs/dev-replace.c create mode 100644 trunk/fs/btrfs/dev-replace.h create mode 100644 trunk/fs/btrfs/math.h create mode 100644 trunk/fs/nfs/nfs4session.c create mode 100644 trunk/fs/nfs/nfs4session.h create mode 100644 trunk/fs/notify/fdinfo.c create mode 100644 trunk/fs/notify/fdinfo.h create mode 100644 trunk/fs/proc/self.c create mode 100644 trunk/include/linux/drbd_genl.h create mode 100644 trunk/include/linux/drbd_genl_api.h delete mode 100644 trunk/include/linux/drbd_nl.h delete mode 100644 trunk/include/linux/drbd_tag_magic.h create mode 100644 trunk/include/linux/genl_magic_func.h create mode 100644 trunk/include/linux/genl_magic_struct.h delete mode 100644 trunk/include/linux/mtd/gpmi-nand.h create mode 100644 trunk/include/linux/platform_data/i2c-cbus-gpio.h delete mode 100644 trunk/include/linux/platform_data/mtd-nomadik-nand.h create mode 100644 trunk/include/uapi/linux/module.h create mode 100644 trunk/kernel/modsign_certificate.S rename trunk/lib/{pSeries-reconfig-notifier-error-inject.c => of-reconfig-notifier-error-inject.c} (51%) create mode 100644 trunk/lib/percpu-rwsem.c create mode 100644 trunk/lib/raid6/avx2.c create mode 100644 trunk/lib/raid6/recov_avx2.c create mode 100644 trunk/scripts/Makefile.modsign create mode 100644 trunk/scripts/coccinelle/api/d_find_alias.cocci diff --git a/[refs] b/[refs] index 14d1d1dd3afd..3eabd3bcdc9b 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 4d1839138220e7e35bf9e31c854e4e0196dea7a1 +refs/heads/master: 1ffab3d4139533eff6e27b7568825307e575faa6 diff --git a/trunk/.gitignore b/trunk/.gitignore index 92bd0e45dfa1..3b8b9b33be38 100644 --- a/trunk/.gitignore +++ b/trunk/.gitignore @@ -60,7 +60,6 @@ modules.builtin # Generated include files # include/config -include/linux/version.h include/generated arch/*/include/generated diff --git a/trunk/Documentation/00-INDEX b/trunk/Documentation/00-INDEX index ceb1ff735469..8afe64fb2009 100644 --- a/trunk/Documentation/00-INDEX +++ b/trunk/Documentation/00-INDEX @@ -136,8 +136,6 @@ fault-injection/ - dir with docs about the fault injection capabilities infrastructure. fb/ - directory with info on the frame buffer graphics abstraction layer. -feature-removal-schedule.txt - - list of files and features that are going to be removed. filesystems/ - info on the vfs and the various filesystems that Linux supports. firmware_class/ diff --git a/trunk/Documentation/ABI/README b/trunk/Documentation/ABI/README index 9feaf16f1617..10069828568b 100644 --- a/trunk/Documentation/ABI/README +++ b/trunk/Documentation/ABI/README @@ -36,9 +36,6 @@ The different levels of stability are: the kernel, but are marked to be removed at some later point in time. The description of the interface will document the reason why it is obsolete and when it can be expected to be removed. - The file Documentation/feature-removal-schedule.txt may describe - some of these interfaces, giving a schedule for when they will - be removed. 
removed/ This directory contains a list of the old interfaces that have diff --git a/trunk/Documentation/ABI/stable/sysfs-devices-node b/trunk/Documentation/ABI/stable/sysfs-devices-node index 49b82cad7003..ce259c13c36a 100644 --- a/trunk/Documentation/ABI/stable/sysfs-devices-node +++ b/trunk/Documentation/ABI/stable/sysfs-devices-node @@ -1,7 +1,101 @@ +What: /sys/devices/system/node/possible +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that could be possibly become online at some point. + +What: /sys/devices/system/node/online +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that are online. + +What: /sys/devices/system/node/has_normal_memory +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have regular memory. + +What: /sys/devices/system/node/has_cpu +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have one or more CPUs. + +What: /sys/devices/system/node/has_high_memory +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have regular or high memory. + Depends on CONFIG_HIGHMEM. + What: /sys/devices/system/node/nodeX Date: October 2002 Contact: Linux Memory Management list Description: When CONFIG_NUMA is enabled, this is a directory containing information on node X such as what CPUs are local to the - node. + node. Each file is detailed next. + +What: /sys/devices/system/node/nodeX/cpumap +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's cpumap. + +What: /sys/devices/system/node/nodeX/cpulist +Date: October 2002 +Contact: Linux Memory Management list +Description: + The CPUs associated to the node. + +What: /sys/devices/system/node/nodeX/meminfo +Date: October 2002 +Contact: Linux Memory Management list +Description: + Provides information about the node's distribution and memory + utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt + +What: /sys/devices/system/node/nodeX/numastat +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's hit/miss statistics, in units of pages. + See Documentation/numastat.txt + +What: /sys/devices/system/node/nodeX/distance +Date: October 2002 +Contact: Linux Memory Management list +Description: + Distance between the node and all the other nodes + in the system. + +What: /sys/devices/system/node/nodeX/vmstat +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's zoned virtual memory statistics. + This is a superset of numastat. + +What: /sys/devices/system/node/nodeX/compact +Date: February 2010 +Contact: Mel Gorman +Description: + When this file is written to, all memory within that node + will be compacted. When it completes, memory will be freed + into blocks which have as many contiguous pages as possible + +What: /sys/devices/system/node/nodeX/scan_unevictable_pages +Date: October 2008 +Contact: Lee Schermerhorn +Description: + When set, it triggers scanning the node's unevictable lists + and move any pages that have become evictable onto the respective + zone's inactive list. See mm/vmscan.c + +What: /sys/devices/system/node/nodeX/hugepages/hugepages-/ +Date: December 2009 +Contact: Lee Schermerhorn +Description: + The node's huge page size control/query attributes. 
+ See Documentation/vm/hugetlbpage.txt \ No newline at end of file diff --git a/trunk/Documentation/ABI/testing/ima_policy b/trunk/Documentation/ABI/testing/ima_policy index 986946613542..ec0a38ef3145 100644 --- a/trunk/Documentation/ABI/testing/ima_policy +++ b/trunk/Documentation/ABI/testing/ima_policy @@ -23,7 +23,7 @@ Description: lsm: [[subj_user=] [subj_role=] [subj_type=] [obj_user=] [obj_role=] [obj_type=]] - base: func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK] + base: func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK][MODULE_CHECK] mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC] fsmagic:= hex value uid:= decimal value @@ -53,6 +53,7 @@ Description: measure func=BPRM_CHECK measure func=FILE_MMAP mask=MAY_EXEC measure func=FILE_CHECK mask=MAY_READ uid=0 + measure func=MODULE_CHECK uid=0 appraise fowner=0 The default policy measures all executables in bprm_check, diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl index 00687ee9d363..f75ab4c1b281 100644 --- a/trunk/Documentation/DocBook/kernel-api.tmpl +++ b/trunk/Documentation/DocBook/kernel-api.tmpl @@ -58,6 +58,9 @@ String Conversions !Elib/vsprintf.c +!Finclude/linux/kernel.h kstrtol +!Finclude/linux/kernel.h kstrtoul +!Elib/kstrtox.c String Manipulation %s ino %lu pgbase %u req %Zu@%llu\n", __func__, hdr->inode->i_ino, @@ -538,9 +531,8 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, &filelayout_read_call_ops, RPC_TASK_SOFTCONN); - BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -554,7 +546,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; - int status; /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); @@ -579,10 +570,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); - BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -909,7 +899,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { /* @@ -939,7 +929,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) goto out_mds; @@ -1187,7 +1177,6 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, */ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - BUG_ON(!list_empty(&b->written)); pnfs_put_lseg(b->wlseg); b->wlseg = NULL; } diff --git a/trunk/fs/nfs/nfs4filelayoutdev.c b/trunk/fs/nfs/nfs4filelayoutdev.c index a8eaa9b7bb0f..b720064bcd7f 100644 --- a/trunk/fs/nfs/nfs4filelayoutdev.c +++ b/trunk/fs/nfs/nfs4filelayoutdev.c @@ -33,6 +33,7 @@ #include #include "internal.h" +#include "nfs4session.h" #include "nfs4filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -162,8 +163,6 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct 
nfs4_pnfs_ds *ds) dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - BUG_ON(list_empty(&ds->ds_addrs)); - list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); diff --git a/trunk/fs/nfs/nfs4proc.c b/trunk/fs/nfs/nfs4proc.c index 5eec4429970c..493f0f41c554 100644 --- a/trunk/fs/nfs/nfs4proc.c +++ b/trunk/fs/nfs/nfs4proc.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -64,14 +63,14 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4session.h" + #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) -#define NFS4_MAX_LOOP_ON_RECOVER (10) - struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -206,7 +205,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent { __be32 *start, *p; - BUG_ON(readdir->count < 80); if (cookie > 2) { readdir->cookie = cookie; memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); @@ -256,22 +254,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } -static int nfs4_wait_clnt_recover(struct nfs_client *clp) -{ - int res; - - might_sleep(); - - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (res) - return res; - - if (clp->cl_cons_state < 0) - return clp->cl_cons_state; - return 0; -} - static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) { int res = 0; @@ -351,7 +333,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: ret = nfs4_delay(server->client, &exception->timeout); if (ret != 0) break; @@ -397,144 +378,136 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp #if defined(CONFIG_NFS_V4_1) -/* - * nfs4_free_slot - free a slot and efficiently update slot table. - * - * freeing a slot is trivially done by clearing its respective bit - * in the bitmap. - * If the freed slotid equals highest_used_slotid we want to update it - * so that the server would be able to size down the slot table if needed, - * otherwise we know that the highest_used_slotid is still in use. - * When updating highest_used_slotid there may be "holes" in the bitmap - * so we need to scan down from highest_used_slotid to 0 looking for the now - * highest slotid in use. - * If none found, highest_used_slotid is set to NFS4_NO_SLOT. 
- * - * Must be called while holding tbl->slot_tbl_lock - */ -static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid) -{ - BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE); - /* clear used bit in bitmap */ - __clear_bit(slotid, tbl->used_slots); - - /* update highest_used_slotid when it is freed */ - if (slotid == tbl->highest_used_slotid) { - slotid = find_last_bit(tbl->used_slots, tbl->max_slots); - if (slotid < tbl->max_slots) - tbl->highest_used_slotid = slotid; - else - tbl->highest_used_slotid = NFS4_NO_SLOT; - } - dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, - slotid, tbl->highest_used_slotid); -} - -bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - return true; -} - -/* - * Signal state manager thread if session fore channel is drained - */ -static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) -{ - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { - rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq, - nfs4_set_task_privileged, NULL); - return; - } - - if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT) - return; - - dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); - complete(&ses->fc_slot_table.complete); -} - -/* - * Signal state manager thread if session back channel is drained - */ -void nfs4_check_drain_bc_complete(struct nfs4_session *ses) -{ - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || - ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT) - return; - dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); - complete(&ses->bc_slot_table.complete); -} - static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { + struct nfs4_session *session; struct nfs4_slot_table *tbl; + bool send_new_highest_used_slotid = false; - tbl = &res->sr_session->fc_slot_table; if (!res->sr_slot) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ dprintk("%s: No slot\n", __func__); return; } + tbl = res->sr_slot->table; + session = tbl->session; spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slot - tbl->slots); - nfs4_check_drain_fc_complete(res->sr_session); + /* Be nice to the server: try to ensure that the last transmitted + * value for highest_user_slotid <= target_highest_slotid + */ + if (tbl->highest_used_slotid > tbl->target_highest_slotid) + send_new_highest_used_slotid = true; + + if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { + send_new_highest_used_slotid = false; + goto out_unlock; + } + nfs4_free_slot(tbl, res->sr_slot); + + if (tbl->highest_used_slotid != NFS4_NO_SLOT) + send_new_highest_used_slotid = false; +out_unlock: spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; + if (send_new_highest_used_slotid) + nfs41_server_notify_highest_slotid_update(session->clp); } static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { - unsigned long timestamp; + struct nfs4_session *session; + struct nfs4_slot *slot; struct nfs_client *clp; - - /* - * sr_status remains 1 if an RPC level error occurred. The server - * may or may not have processed the sequence operation.. - * Proceed as if the server received and processed the sequence - * operation. 
- */ - if (res->sr_status == 1) - res->sr_status = NFS_OK; + bool interrupted = false; + int ret = 1; /* don't increment the sequence number if the task wasn't sent */ if (!RPC_WAS_SENT(task)) goto out; + slot = res->sr_slot; + session = slot->table->session; + + if (slot->interrupted) { + slot->interrupted = 0; + interrupted = true; + } + /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++res->sr_slot->seq_nr; - timestamp = res->sr_renewal_time; - clp = res->sr_session->clp; - do_renew_lease(clp, timestamp); + ++slot->seq_nr; + clp = session->clp; + do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ if (res->sr_status_flags != 0) nfs4_schedule_lease_recovery(clp); + nfs41_update_target_slotid(slot->table, slot, res); break; + case 1: + /* + * sr_status remains 1 if an RPC level error occurred. + * The server may or may not have processed the sequence + * operation.. + * Mark the slot as having hosted an interrupted RPC call. + */ + slot->interrupted = 1; + goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and * returned NFS4ERR_DELAY as per Section 2.10.6.2 * of RFC5661. */ - dprintk("%s: slot=%td seq=%d: Operation in progress\n", + dprintk("%s: slot=%u seq=%u: Operation in progress\n", __func__, - res->sr_slot - res->sr_session->fc_slot_table.slots, - res->sr_slot->seq_nr); + slot->slot_nr, + slot->seq_nr); goto out_retry; + case -NFS4ERR_BADSLOT: + /* + * The slot id we used was probably retired. Try again + * using a different slot id. + */ + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + /* + * Was the last operation on this sequence interrupted? + * If so, retry after bumping the sequence number. + */ + if (interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } + /* + * Could this slot have been previously retired? + * If so, then the server may be expecting seq_nr = 1! + */ + if (slot->seq_nr != 1) { + slot->seq_nr = 1; + goto retry_nowait; + } + break; + case -NFS4ERR_SEQ_FALSE_RETRY: + ++slot->seq_nr; + goto retry_nowait; default: /* Just update the slot sequence no. */ - ++res->sr_slot->seq_nr; + ++slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); - return 1; + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) { + task->tk_status = 0; + ret = 0; + } + goto out; out_retry: if (!rpc_restart_call(task)) goto out; @@ -545,55 +518,27 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { - if (res->sr_session == NULL) + if (res->sr_slot == NULL) return 1; return nfs41_sequence_done(task, res); } -/* - * nfs4_find_slot - efficiently look for a free slot - * - * nfs4_find_slot looks for an unset bit in the used_slots bitmap. - * If found, we mark the slot as used, update the highest_used_slotid, - * and respectively set up the sequence operation args. - * The slot number is returned if found, or NFS4_NO_SLOT otherwise. - * - * Note: must be called with under the slot_tbl_lock. 
- */ -static u32 -nfs4_find_slot(struct nfs4_slot_table *tbl) -{ - u32 slotid; - u32 ret_id = NFS4_NO_SLOT; - - dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, - tbl->max_slots); - slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); - if (slotid >= tbl->max_slots) - goto out; - __set_bit(slotid, tbl->used_slots); - if (slotid > tbl->highest_used_slotid || - tbl->highest_used_slotid == NFS4_NO_SLOT) - tbl->highest_used_slotid = slotid; - ret_id = slotid; -out: - dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); - return ret_id; -} - static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - args->sa_session = NULL; + args->sa_slot = NULL; args->sa_cache_this = 0; + args->sa_privileged = 0; if (cache_reply) args->sa_cache_this = 1; - res->sr_session = NULL; res->sr_slot = NULL; } +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ + args->sa_privileged = 1; +} + int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -601,59 +546,59 @@ int nfs41_setup_sequence(struct nfs4_session *session, { struct nfs4_slot *slot; struct nfs4_slot_table *tbl; - u32 slotid; dprintk("--> %s\n", __func__); /* slot already allocated? */ if (res->sr_slot != NULL) - return 0; + goto out_success; tbl = &session->fc_slot_table; + task->tk_timeout = 0; + spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && - !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); dprintk("%s session is draining\n", __func__); - return -EAGAIN; + goto out_sleep; } - if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && - !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s enforce FIFO order\n", __func__); - return -EAGAIN; - } - - slotid = nfs4_find_slot(tbl); - if (slotid == NFS4_NO_SLOT) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* If out of memory, try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; dprintk("<-- %s: no free slots\n", __func__); - return -EAGAIN; + goto out_sleep; } spin_unlock(&tbl->slot_tbl_lock); - rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); - slot = tbl->slots + slotid; - args->sa_session = session; - args->sa_slotid = slotid; + args->sa_slot = slot; - dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + dprintk("<-- %s slotid=%d seqid=%d\n", __func__, + slot->slot_nr, slot->seq_nr); - res->sr_session = session; res->sr_slot = slot; - res->sr_renewal_time = jiffies; + res->sr_timestamp = jiffies; res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain * set to 1 if an rpc level failure occurs. 
*/ res->sr_status = 1; +out_success: + rpc_call_start(task); return 0; +out_sleep: + /* Privileged tasks are queued with top priority */ + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; } EXPORT_SYMBOL_GPL(nfs41_setup_sequence); @@ -665,12 +610,14 @@ int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_session *session = nfs4_get_session(server); int ret = 0; - if (session == NULL) + if (session == NULL) { + rpc_call_start(task); goto out; + } - dprintk("--> %s clp %p session %p sr_slot %td\n", + dprintk("--> %s clp %p session %p sr_slot %d\n", __func__, session->clp, session, res->sr_slot ? - res->sr_slot - session->fc_slot_table.slots : -1); + res->sr_slot->slot_nr : -1); ret = nfs41_setup_sequence(session, args, res, task); out: @@ -687,19 +634,11 @@ struct nfs41_call_sync_data { static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; + struct nfs4_session *session = nfs4_get_session(data->seq_server); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - if (nfs4_setup_sequence(data->seq_server, data->seq_args, - data->seq_res, task)) - return; - rpc_call_start(task); -} - -static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs41_call_sync_prepare(task, calldata); + nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); } static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) @@ -714,17 +653,11 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static const struct rpc_call_ops nfs41_call_priv_sync_ops = { - .rpc_call_prepare = nfs41_call_priv_sync_prepare, - .rpc_call_done = nfs41_call_sync_done, -}; - static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int privileged) + struct nfs4_sequence_res *res) { int ret; struct rpc_task *task; @@ -740,8 +673,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - if (privileged) - task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -752,24 +683,18 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, return ret; } -int _nfs4_call_sync_session(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply) -{ - nfs41_init_sequence(args, res, cache_reply); - return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0); -} - #else -static inline +static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { } +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ +} + + static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -777,18 +702,17 @@ static int nfs4_sequence_done(struct rpc_task *task, } #endif /* CONFIG_NFS_V4_1 */ +static int _nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply) + struct nfs4_sequence_res *res) { - 
nfs41_init_sequence(args, res, cache_reply); return rpc_call_sync(clnt, msg, 0); } -static inline +static int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -796,8 +720,9 @@ int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { + nfs41_init_sequence(args, res, cache_reply); return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, - args, res, cache_reply); + args, res); } static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) @@ -1445,13 +1370,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state nfs_inode_find_state_and_recover(state->inode, stateid); nfs4_schedule_stateid_recovery(server, state); - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ case -ENOMEM: err = 0; goto out; @@ -1574,20 +1492,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) &data->o_res.seq_res, task) != 0) nfs_release_seqid(data->o_arg.seqid); - else - rpc_call_start(task); return; unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; - -} - -static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs4_open_prepare(task, calldata); + nfs4_sequence_done(task, &data->o_res.seq_res); } static void nfs4_open_done(struct rpc_task *task, void *calldata) @@ -1648,12 +1558,6 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -static const struct rpc_call_ops nfs4_recover_open_ops = { - .rpc_call_prepare = nfs4_recover_open_prepare, - .rpc_call_done = nfs4_open_done, - .rpc_release = nfs4_open_release, -}; - static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { struct inode *dir = data->dir->d_inode; @@ -1683,7 +1587,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->rpc_status = 0; data->cancelled = 0; if (isrecover) - task_setup_data.callback_ops = &nfs4_recover_open_ops; + nfs4_set_sequence_privileged(&o_arg->seq_args); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1789,24 +1693,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_client_recover_expired_lease(struct nfs_client *clp) -{ - unsigned int loop; - int ret; - - for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { - ret = nfs4_wait_clnt_recover(clp); - if (ret != 0) - break; - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && - !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) - break; - nfs4_schedule_state_manager(clp); - ret = -EIO; - } - return ret; -} - static int nfs4_recover_expired_lease(struct nfs_server *server) { return nfs4_client_recover_expired_lease(server->nfs_client); @@ -2282,6 +2168,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; + nfs4_sequence_done(task, &calldata->res.seq_res); goto out; } @@ -2299,8 +2186,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - else - rpc_call_start(task); out: dprintk("%s: done!\n", __func__); } @@ -2533,7 +2418,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, rpc_authflavor_t 
flav_array[NFS_MAX_SECFLAVORS]; len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); - BUG_ON(len < 0); + if (len < 0) + return len; for (i = 0; i < len; i++) { /* AUTH_UNIX is the default flavor if none was specified, @@ -3038,12 +2924,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->dir), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) @@ -3071,12 +2955,10 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->old_dir), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->old_dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, @@ -3362,9 +3244,6 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, int mode = sattr->ia_mode; int status = -ENOMEM; - BUG_ON(!(sattr->ia_valid & ATTR_MODE)); - BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); - data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); if (data == NULL) goto out; @@ -3380,10 +3259,13 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, data->arg.ftype = NF4CHR; data->arg.u.device.specdata1 = MAJOR(rdev); data->arg.u.device.specdata2 = MINOR(rdev); + } else if (!S_ISSOCK(mode)) { + status = -EINVAL; + goto out_free; } status = nfs4_do_create(dir, dentry, data); - +out_free: nfs4_free_createdata(data); out: return status; @@ -3565,12 +3447,10 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3631,22 +3511,18 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) @@ -3937,8 +3813,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu goto out_free; 
} nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); - if (buf) + if (buf) { + if (res.acl_len > buflen) { + ret = -ERANGE; + goto out_free; + } _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); + } out_ok: ret = res.acl_len; out_free: @@ -4085,7 +3966,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; @@ -4293,11 +4173,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_setup_sequence(d_data->res.server, - &d_data->args.seq_args, - &d_data->res.seq_res, task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, + task); } #endif /* CONFIG_NFS_V4_1 */ @@ -4543,6 +4422,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ task->tk_action = NULL; + nfs4_sequence_done(task, &calldata->res.seq_res); return; } calldata->timestamp = jiffies; @@ -4551,8 +4431,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - else - rpc_call_start(task); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4696,8 +4574,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) return; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { - if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; + } data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; @@ -4707,20 +4586,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs4_setup_sequence(data->server, &data->arg.seq_args, &data->res.seq_res, - task) == 0) { - rpc_call_start(task); + task) == 0) return; - } nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); - dprintk("%s: done!, ret = %d\n", __func__, task->tk_status); -} - -static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs4_lock_prepare(task, calldata); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } static void nfs4_lock_done(struct rpc_task *task, void *calldata) @@ -4775,12 +4646,6 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; -static const struct rpc_call_ops nfs4_recover_lock_ops = { - .rpc_call_prepare = nfs4_recover_lock_prepare, - .rpc_call_done = nfs4_lock_done, - .rpc_release = nfs4_lock_release, -}; - static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { switch (error) { @@ -4823,15 +4688,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - if (recovery_type > NFS_LOCK_NEW) { - if (recovery_type == NFS_LOCK_RECLAIM) - data->arg.reclaim = NFS_LOCK_RECLAIM; - task_setup_data.callback_ops = &nfs4_recover_lock_ops; - } 
nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + nfs4_set_sequence_privileged(&data->arg.seq_args); + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5100,15 +4965,6 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ - err = 0; - goto out; case -ENOMEM: case -NFS4ERR_DENIED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ @@ -5357,7 +5213,6 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred }; dprintk("--> %s\n", __func__); - BUG_ON(clp == NULL); res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); if (unlikely(res.session == NULL)) { @@ -5569,20 +5424,16 @@ struct nfs4_get_lease_time_data { static void nfs4_get_lease_time_prepare(struct rpc_task *task, void *calldata) { - int ret; struct nfs4_get_lease_time_data *data = (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* just setup sequence, do not trigger session recovery since we're invoked within one */ - ret = nfs41_setup_sequence(data->clp->cl_session, - &data->args->la_seq_args, - &data->res->lr_seq_res, task); - - BUG_ON(ret == -EAGAIN); - rpc_call_start(task); + nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, + task); dprintk("<-- %s\n", __func__); } @@ -5644,6 +5495,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) int status; nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); + nfs4_set_sequence_privileged(&args.la_seq_args); dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -5658,145 +5510,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } -static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) -{ - return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); -} - -static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, - struct nfs4_slot *new, - u32 max_slots, - u32 ivalue) -{ - struct nfs4_slot *old = NULL; - u32 i; - - spin_lock(&tbl->slot_tbl_lock); - if (new) { - old = tbl->slots; - tbl->slots = new; - tbl->max_slots = max_slots; - } - tbl->highest_used_slotid = NFS4_NO_SLOT; - for (i = 0; i < tbl->max_slots; i++) - tbl->slots[i].seq_nr = ivalue; - spin_unlock(&tbl->slot_tbl_lock); - kfree(old); -} - -/* - * (re)Initialise a slot table - */ -static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, - u32 ivalue) -{ - struct nfs4_slot *new = NULL; - int ret = -ENOMEM; - - dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, - max_reqs, tbl->max_slots); - - /* Does the newly negotiated max_reqs match the existing slot table? 
*/ - if (max_reqs != tbl->max_slots) { - new = nfs4_alloc_slots(max_reqs, GFP_NOFS); - if (!new) - goto out; - } - ret = 0; - - nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue); - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, - tbl, tbl->slots, tbl->max_slots); -out: - dprintk("<-- %s: return %d\n", __func__, ret); - return ret; -} - -/* Destroy the slot table */ -static void nfs4_destroy_slot_tables(struct nfs4_session *session) -{ - if (session->fc_slot_table.slots != NULL) { - kfree(session->fc_slot_table.slots); - session->fc_slot_table.slots = NULL; - } - if (session->bc_slot_table.slots != NULL) { - kfree(session->bc_slot_table.slots); - session->bc_slot_table.slots = NULL; - } - return; -} - -/* - * Initialize or reset the forechannel and backchannel tables - */ -static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) -{ - struct nfs4_slot_table *tbl; - int status; - - dprintk("--> %s\n", __func__); - /* Fore channel */ - tbl = &ses->fc_slot_table; - status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ - return status; - /* Back channel */ - tbl = &ses->bc_slot_table; - status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); - if (status && tbl->slots == NULL) - /* Fore and back channel share a connection so get - * both slot tables or neither */ - nfs4_destroy_slot_tables(ses); - return status; -} - -struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) -{ - struct nfs4_session *session; - struct nfs4_slot_table *tbl; - - session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); - if (!session) - return NULL; - - tbl = &session->fc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); - init_completion(&tbl->complete); - - tbl = &session->bc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); - init_completion(&tbl->complete); - - session->session_state = 1<clp = clp; - return session; -} - -void nfs4_destroy_session(struct nfs4_session *session) -{ - struct rpc_xprt *xprt; - struct rpc_cred *cred; - - cred = nfs4_get_exchange_id_cred(session->clp); - nfs4_proc_destroy_session(session, cred); - if (cred) - put_rpccred(cred); - - rcu_read_lock(); - xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); - rcu_read_unlock(); - dprintk("%s Destroy backchannel for xprt %p\n", - __func__, xprt); - xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); - nfs4_destroy_slot_tables(session); - kfree(session); -} - /* * Initialize the values to be used by the client in CREATE_SESSION * If nfs4_init_session set the fore channel request and response sizes, @@ -5809,8 +5522,8 @@ void nfs4_destroy_session(struct nfs4_session *session) static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) { struct nfs4_session *session = args->client->cl_session; - unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, - mxresp_sz = session->fc_attrs.max_resp_sz; + unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, + mxresp_sz = session->fc_target_max_resp_sz; if (mxrqst_sz == 0) mxrqst_sz = NFS_MAX_FILE_IO_SIZE; @@ -5919,10 +5632,9 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (!status) + if (!status) { /* Verify the session's negotiated channel_attrs values */ status 
= nfs4_verify_channel_attrs(&args, session); - if (!status) { /* Increment the clientid slot sequence id */ clp->cl_seqid++; } @@ -5991,83 +5703,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, return status; } -/* - * With sessions, the client is not marked ready until after a - * successful EXCHANGE_ID and CREATE_SESSION. - * - * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate - * other versions of NFS can be tried. - */ -static int nfs41_check_session_ready(struct nfs_client *clp) -{ - int ret; - - if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { - ret = nfs4_client_recover_expired_lease(clp); - if (ret) - return ret; - } - if (clp->cl_cons_state < NFS_CS_READY) - return -EPROTONOSUPPORT; - smp_rmb(); - return 0; -} - -int nfs4_init_session(struct nfs_server *server) -{ - struct nfs_client *clp = server->nfs_client; - struct nfs4_session *session; - unsigned int rsize, wsize; - - if (!nfs4_has_session(clp)) - return 0; - - session = clp->cl_session; - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - - rsize = server->rsize; - if (rsize == 0) - rsize = NFS_MAX_FILE_IO_SIZE; - wsize = server->wsize; - if (wsize == 0) - wsize = NFS_MAX_FILE_IO_SIZE; - - session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; - session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; - } - spin_unlock(&clp->cl_lock); - - return nfs41_check_session_ready(clp); -} - -int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) -{ - struct nfs4_session *session = clp->cl_session; - int ret; - - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - /* - * Do not set NFS_CS_CHECK_LEASE_TIME instead set the - * DS lease to be equal to the MDS lease. - */ - clp->cl_lease_time = lease_time; - clp->cl_last_renewal = jiffies; - } - spin_unlock(&clp->cl_lock); - - ret = nfs41_check_session_ready(clp); - if (ret) - return ret; - /* Test for the DS role */ - if (!is_ds_client(clp)) - return -ENODEV; - return 0; -} -EXPORT_SYMBOL_GPL(nfs4_init_ds_session); - - /* * Renew the cl_session lease. 
*/ @@ -6133,9 +5768,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs41_setup_sequence(clp->cl_session, args, res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(clp->cl_session, args, res, task); } static const struct rpc_call_ops nfs41_sequence_ops = { @@ -6144,7 +5777,9 @@ static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_release = nfs41_sequence_release, }; -static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + bool is_privileged) { struct nfs4_sequence_data *calldata; struct rpc_message msg = { @@ -6166,6 +5801,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ return ERR_PTR(-ENOMEM); } nfs41_init_sequence(&calldata->args, &calldata->res, 0); + if (is_privileged) + nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -6181,7 +5818,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return 0; - task = _nfs41_proc_sequence(clp, cred); + task = _nfs41_proc_sequence(clp, cred, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -6195,7 +5832,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred); + task = _nfs41_proc_sequence(clp, cred, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -6224,13 +5861,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - if (nfs41_setup_sequence(calldata->clp->cl_session, - &calldata->arg.seq_args, - &calldata->res.seq_res, task)) - return; - - rpc_call_start(task); + nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task); } static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) @@ -6307,6 +5941,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) calldata->arg.one_fs = 0; nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); + nfs4_set_sequence_privileged(&calldata->arg.seq_args); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -6330,6 +5965,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct nfs4_session *session = nfs4_get_session(server); dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -6337,16 +5973,14 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) * However, that is not so catastrophic, and there seems * to be no way to prevent it completely. 
*/ - if (nfs4_setup_sequence(server, &lgp->args.seq_args, + if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, lgp->args.ctx->state)) { rpc_exit(task, NFS4_OK); - return; } - rpc_call_start(task); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -6359,7 +5993,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; switch (task->tk_status) { @@ -6510,10 +6144,10 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, - &lrp->res.seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(lrp->clp->cl_session, + &lrp->args.seq_args, + &lrp->res.seq_res, + task); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) @@ -6523,7 +6157,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_done(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -6672,11 +6306,12 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); + struct nfs4_session *session = nfs4_get_session(server); - if (nfs4_setup_sequence(server, &data->args.seq_args, - &data->res.seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(session, + &data->args.seq_args, + &data->res.seq_res, + task); } static void @@ -6685,7 +6320,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); - if (!nfs4_sequence_done(task, &data->res.seq_res)) + if (!nfs41_sequence_done(task, &data->res.seq_res)) return; switch (task->tk_status) { /* Just ignore these failures */ @@ -6873,7 +6508,9 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) dprintk("NFS call test_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(server->client, server, &msg, + &args.seq_args, &res.seq_res); if (status != NFS_OK) { dprintk("NFS reply test_stateid: failed, %d\n", status); return status; @@ -6920,8 +6557,9 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) dprintk("NFS call free_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); status = nfs4_call_sync_sequence(server->client, server, &msg, - &args.seq_args, &res.seq_res, 1); + &args.seq_args, &res.seq_res); dprintk("NFS reply free_stateid: %d\n", status); return status; } @@ -7041,7 +6679,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { #if defined(CONFIG_NFS_V4_1) static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, - .call_sync = _nfs4_call_sync_session, + .call_sync = nfs4_call_sync_sequence, .match_stateid = 
nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, diff --git a/trunk/fs/nfs/nfs4session.c b/trunk/fs/nfs/nfs4session.c new file mode 100644 index 000000000000..ebda5f4a031b --- /dev/null +++ b/trunk/fs/nfs/nfs4session.c @@ -0,0 +1,552 @@ +/* + * fs/nfs/nfs4session.c + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) +{ + struct nfs4_slot **p; + if (newsize >= tbl->max_slots) + return; + + p = &tbl->slots; + while (newsize--) + p = &(*p)->next; + while (*p) { + struct nfs4_slot *slot = *p; + + *p = slot->next; + kfree(slot); + tbl->max_slots--; + } +} + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. + * + * Must be called while holding tbl->slot_tbl_lock + */ +void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) +{ + u32 slotid = slot->slot_nr; + + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + u32 new_max = find_last_bit(tbl->used_slots, slotid); + if (new_max < slotid) + tbl->highest_used_slotid = new_max; + else { + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_session_drain_complete(tbl->session, tbl); + } + } + dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, + slotid, tbl->highest_used_slotid); +} + +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot *slot; + + slot = kzalloc(sizeof(*slot), gfp_mask); + if (slot) { + slot->table = tbl; + slot->slot_nr = slotid; + slot->seq_nr = seq_init; + } + return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot **p, *slot; + + p = &tbl->slots; + for (;;) { + if (*p == NULL) { + *p = nfs4_new_slot(tbl, tbl->max_slots, + seq_init, gfp_mask); + if (*p == NULL) + break; + tbl->max_slots++; + } + slot = *p; + if (slot->slot_nr == slotid) + return slot; + p = &slot->next; + } + return ERR_PTR(-ENOMEM); +} + +/* + * nfs4_alloc_slot - efficiently look for a free slot + * + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * + * Note: must be called with under the slot_tbl_lock. 
+ */ +struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *ret = ERR_PTR(-EBUSY); + u32 slotid; + + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slotid + 1); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); + if (slotid > tbl->max_slotid) + goto out; + ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + if (IS_ERR(ret)) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid || + tbl->highest_used_slotid == NFS4_NO_SLOT) + tbl->highest_used_slotid = slotid; + ret->generation = tbl->generation; + +out: + dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + !IS_ERR(ret) ? ret->slot_nr : -1); + return ret; +} + +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + if (max_reqs <= tbl->max_slots) + return 0; + if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) + return 0; + return -ENOMEM; +} + +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, + u32 server_highest_slotid, + u32 ivalue) +{ + struct nfs4_slot **p; + + nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); + p = &tbl->slots; + while (*p) { + (*p)->seq_nr = ivalue; + (*p)->interrupted = 0; + p = &(*p)->next; + } + tbl->highest_used_slotid = NFS4_NO_SLOT; + tbl->target_highest_slotid = server_highest_slotid; + tbl->server_highest_slotid = server_highest_slotid; + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + tbl->max_slotid = server_highest_slotid; +} + +/* + * (re)Initialise a slot table + */ +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + int ret; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); + + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + goto out; + + spin_lock(&tbl->slot_tbl_lock); + nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); + spin_unlock(&tbl->slot_tbl_lock); + + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* Destroy the slot table */ +static void nfs4_destroy_slot_tables(struct nfs4_session *session) +{ + nfs4_shrink_slot_table(&session->fc_slot_table, 0); + nfs4_shrink_slot_table(&session->bc_slot_table, 0); +} + +static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) +{ + struct nfs4_sequence_args *args = task->tk_msg.rpc_argp; + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + struct nfs4_slot *slot = pslot; + struct nfs4_slot_table *tbl = slot->table; + + if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + return false; + slot->generation = tbl->generation; + args->sa_slot = slot; + res->sr_timestamp = jiffies; + res->sr_slot = slot; + res->sr_status_flags = 0; + res->sr_status = 1; + return true; +} + +static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot)) + return true; + return false; +} + +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (slot->slot_nr > tbl->max_slotid) + return false; + return __nfs41_wake_and_assign_slot(tbl, slot); +} + 
+static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *slot = nfs4_alloc_slot(tbl); + if (!IS_ERR(slot)) { + bool ret = __nfs41_wake_and_assign_slot(tbl, slot); + if (ret) + return ret; + nfs4_free_slot(tbl, slot); + } + return false; +} + +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) +{ + for (;;) { + if (!nfs41_try_wake_next_slot_table_entry(tbl)) + break; + } +} + +static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + u32 max_slotid; + + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid); + if (max_slotid > tbl->server_highest_slotid) + max_slotid = tbl->server_highest_slotid; + if (max_slotid > tbl->target_highest_slotid) + max_slotid = tbl->target_highest_slotid; + tbl->max_slotid = max_slotid; + nfs41_wake_slot_table(tbl); +} + +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + if (tbl->target_highest_slotid == target_highest_slotid) + return; + tbl->target_highest_slotid = target_highest_slotid; + tbl->generation++; +} + +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + spin_lock(&tbl->slot_tbl_lock); + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + nfs41_set_max_slotid_locked(tbl, target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, + u32 highest_slotid) +{ + if (tbl->server_highest_slotid == highest_slotid) + return; + if (tbl->highest_used_slotid > highest_slotid) + return; + /* Deallocate slots */ + nfs4_shrink_slot_table(tbl, highest_slotid + 1); + tbl->server_highest_slotid = highest_slotid; +} + +static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2) +{ + s1 -= s2; + if (s1 == 0) + return 0; + if (s1 < 0) + return (s1 - 1) >> 1; + return (s1 + 1) >> 1; +} + +static int nfs41_sign_s32(s32 s1) +{ + if (s1 > 0) + return 1; + if (s1 < 0) + return -1; + return 0; +} + +static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2) +{ + if (!s1 || !s2) + return true; + return nfs41_sign_s32(s1) == nfs41_sign_s32(s2); +} + +/* Try to eliminate outliers by checking for sharp changes in the + * derivatives and second derivatives + */ +static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl, + u32 new_target) +{ + s32 d_target, d2_target; + bool ret = true; + + d_target = nfs41_derivative_target_slotid(new_target, + tbl->target_highest_slotid); + d2_target = nfs41_derivative_target_slotid(d_target, + tbl->d_target_highest_slotid); + /* Is first derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid)) + ret = false; + /* Is second derivative same sign? 
*/ + if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid)) + ret = false; + tbl->d_target_highest_slotid = d_target; + tbl->d2_target_highest_slotid = d2_target; + return ret; +} + +void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res) +{ + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); + if (tbl->generation == slot->generation) + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); + nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +/* + * Initialize or reset the forechannel and backchannel tables + */ +int nfs4_setup_session_slot_tables(struct nfs4_session *ses) +{ + struct nfs4_slot_table *tbl; + int status; + + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; + /* Back channel */ + tbl = &ses->bc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); + return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + tbl = &session->fc_slot_table; + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); + + tbl = &session->bc_slot_table; + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); + + session->session_state = 1<<NFS4_SESSION_INITING; + + session->clp = clp; + return session; +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + struct rpc_xprt *xprt; + struct rpc_cred *cred; + + cred = nfs4_get_exchange_id_cred(session->clp); + nfs4_proc_destroy_session(session, cred); + if (cred) + put_rpccred(cred); + + rcu_read_lock(); + xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, xprt); + xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_slot_tables(session); + kfree(session); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. 
+ */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ + int ret; + + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + ret = nfs4_client_recover_expired_lease(clp); + if (ret) + return ret; + } + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + smp_rmb(); + return 0; +} + +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; + unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; + unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; + + if (!nfs4_has_session(clp)) + return 0; + + if (server->rsize != 0) + target_max_resp_sz = server->rsize; + target_max_resp_sz += nfs41_maxread_overhead; + + if (server->wsize != 0) + target_max_rqst_sz = server->wsize; + target_max_rqst_sz += nfs41_maxwrite_overhead; + + session = clp->cl_session; + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* Initialise targets and channel attributes */ + session->fc_target_max_rqst_sz = target_max_rqst_sz; + session->fc_attrs.max_rqst_sz = target_max_rqst_sz; + session->fc_target_max_resp_sz = target_max_resp_sz; + session->fc_attrs.max_resp_sz = target_max_resp_sz; + } else { + /* Just adjust the targets */ + if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { + session->fc_target_max_rqst_sz = target_max_rqst_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + if (target_max_resp_sz > session->fc_target_max_resp_sz) { + session->fc_target_max_resp_sz = target_max_resp_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + } + spin_unlock(&clp->cl_lock); + + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + nfs4_schedule_lease_recovery(clp); + + return nfs41_check_session_ready(clp); +} + +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the + * DS lease to be equal to the MDS lease. 
+ */ + clp->cl_lease_time = lease_time; + clp->cl_last_renewal = jiffies; + } + spin_unlock(&clp->cl_lock); + + ret = nfs41_check_session_ready(clp); + if (ret) + return ret; + /* Test for the DS role */ + if (!is_ds_client(clp)) + return -ENODEV; + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + diff --git a/trunk/fs/nfs/nfs4session.h b/trunk/fs/nfs/nfs4session.h new file mode 100644 index 000000000000..6f3cb39386d4 --- /dev/null +++ b/trunk/fs/nfs/nfs4session.h @@ -0,0 +1,142 @@ +/* + * fs/nfs/nfs4session.h + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#ifndef __LINUX_FS_NFS_NFS4SESSION_H +#define __LINUX_FS_NFS_NFS4SESSION_H + +/* maximum number of slots to use */ +#define NFS4_DEF_SLOT_TABLE_SIZE (16U) +#define NFS4_MAX_SLOT_TABLE (1024U) +#define NFS4_NO_SLOT ((u32)-1) + +#if IS_ENABLED(CONFIG_NFS_V4) + +/* Sessions slot seqid */ +struct nfs4_slot { + struct nfs4_slot_table *table; + struct nfs4_slot *next; + unsigned long generation; + u32 slot_nr; + u32 seq_nr; + unsigned int interrupted : 1; +}; + +/* Sessions */ +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +struct nfs4_slot_table { + struct nfs4_session *session; /* Parent session */ + struct nfs4_slot *slots; /* seqid per slot */ + unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ + spinlock_t slot_tbl_lock; + struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + u32 max_slots; /* # slots in table */ + u32 max_slotid; /* Max allowed slotid value */ + u32 highest_used_slotid; /* sent to server on each SEQ. + * op for dynamic resizing */ + u32 target_highest_slotid; /* Server max_slot target */ + u32 server_highest_slotid; /* Server highest slotid */ + s32 d_target_highest_slotid; /* Derivative */ + s32 d2_target_highest_slotid; /* 2nd derivative */ + unsigned long generation; /* Generation counter for + target_highest_slotid */ + struct completion complete; +}; + +/* + * Session related parameters + */ +struct nfs4_session { + struct nfs4_sessionid sess_id; + u32 flags; + unsigned long session_state; + u32 hash_alg; + u32 ssv_len; + + /* The fore and back channel */ + struct nfs4_channel_attrs fc_attrs; + struct nfs4_slot_table fc_slot_table; + struct nfs4_channel_attrs bc_attrs; + struct nfs4_slot_table bc_slot_table; + struct nfs_client *clp; + /* Create session arguments */ + unsigned int fc_target_max_rqst_sz; + unsigned int fc_target_max_resp_sz; +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +#if defined(CONFIG_NFS_V4_1) +extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); +extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); + +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid); +extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res); + +extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); + +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); + +extern void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl); + +static inline bool nfs4_session_draining(struct nfs4_session *session) +{ + return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); +} + +bool 
nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + if (clp->cl_session) + return 1; + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); + return 0; +} + +#else /* defined(CONFIG_NFS_V4_1) */ + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + return 0; +} + +#endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ +#endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/trunk/fs/nfs/nfs4state.c b/trunk/fs/nfs/nfs4state.c index c351e6b39838..9448c579d41a 100644 --- a/trunk/fs/nfs/nfs4state.c +++ b/trunk/fs/nfs/nfs4state.c @@ -57,6 +57,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -66,7 +67,6 @@ const nfs4_stateid zero_stateid; static DEFINE_MUTEX(nfs_clid_init_mutex); -static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { @@ -254,24 +254,27 @@ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; struct nfs4_slot_table *tbl; - int max_slots; if (ses == NULL) return; tbl = &ses->fc_slot_table; if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&tbl->slot_tbl_lock); - max_slots = tbl->max_slots; - while (max_slots--) { - if (rpc_wake_up_first(&tbl->slot_tbl_waitq, - nfs4_set_task_privileged, - NULL) == NULL) - break; - } + nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); } } +/* + * Signal state manager thread if session fore channel is drained + */ +void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl) +{ + if (nfs4_session_draining(session)) + complete(&tbl->complete); +} + static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { spin_lock(&tbl->slot_tbl_lock); @@ -303,7 +306,6 @@ static void nfs41_finish_session_reset(struct nfs_client *clp) clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ - clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); nfs41_setup_state_renewal(clp); } @@ -1086,7 +1088,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid) */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { - BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid); switch (status) { case 0: break; @@ -1209,6 +1210,40 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (res) + return res; + + if (clp->cl_cons_state < 0) + return clp->cl_cons_state; + return 0; +} + +int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = 
NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret = -EIO; + } + return ret; +} + /* * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN * @clp: client to process @@ -1401,14 +1436,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs /* Mark the file as being 'closed' */ state->state = 0; break; - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ - break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1561,14 +1588,6 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } -static void nfs4_warn_keyexpired(const char *s) -{ - printk_ratelimited(KERN_WARNING "Error: state manager" - " encountered RPCSEC_GSS session" - " expired against NFSv4 server %s.\n", - s); -} - static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { @@ -1602,10 +1621,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); break; - case -EKEYEXPIRED: - /* Nothing we can do */ - nfs4_warn_keyexpired(clp->cl_hostname); - break; default: dprintk("%s: failed to handle error %d for server %s\n", __func__, error, clp->cl_hostname); @@ -1722,8 +1737,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) dprintk("%s: exit with error %d for server %s\n", __func__, -EPROTONOSUPPORT, clp->cl_hostname); return -EPROTONOSUPPORT; - case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: @@ -1876,7 +1889,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, break; case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ status = -EKEYEXPIRED; @@ -1907,14 +1919,23 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); -void nfs41_handle_recall_slot(struct nfs_client *clp) +static void nfs41_ping_server(struct nfs_client *clp) { - set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - dprintk("%s: scheduling slot recall for server %s\n", __func__, - clp->cl_hostname); + /* Use CHECK_LEASE to ping the server with a SEQUENCE */ + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); nfs4_schedule_state_manager(clp); } +void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + +void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { @@ -2024,35 +2045,6 @@ static int nfs4_reset_session(struct nfs_client *clp) return status; } -static int nfs4_recall_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *fc_tbl; - struct nfs4_slot *new, *old; - int i; - - if (!nfs4_has_session(clp)) - return 0; - nfs4_begin_drain_session(clp); - fc_tbl = &clp->cl_session->fc_slot_table; - new = 
kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), - GFP_NOFS); - if (!new) - return -ENOMEM; - - spin_lock(&fc_tbl->slot_tbl_lock); - for (i = 0; i < fc_tbl->target_max_slots; i++) - new[i].seq_nr = fc_tbl->slots[i].seq_nr; - old = fc_tbl->slots; - fc_tbl->slots = new; - fc_tbl->max_slots = fc_tbl->target_max_slots; - fc_tbl->target_max_slots = 0; - clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots; - spin_unlock(&fc_tbl->slot_tbl_lock); - - kfree(old); - return 0; -} - static int nfs4_bind_conn_to_session(struct nfs_client *clp) { struct rpc_cred *cred; @@ -2083,7 +2075,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } -static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } static int nfs4_bind_conn_to_session(struct nfs_client *clp) { @@ -2115,15 +2106,6 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } - if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { - section = "check lease"; - status = nfs4_check_lease(clp); - if (status < 0) - goto out_error; - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) - continue; - } - /* Initialize or reset the session */ if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { section = "reset session"; @@ -2144,10 +2126,9 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } - /* Recall session slots */ - if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { - section = "recall slot"; - status = nfs4_recall_slot(clp); + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + section = "check lease"; + status = nfs4_check_lease(clp); if (status < 0) goto out_error; continue; diff --git a/trunk/fs/nfs/nfs4super.c b/trunk/fs/nfs/nfs4super.c index bd61221ad2c5..84d2e9e2f313 100644 --- a/trunk/fs/nfs/nfs4super.c +++ b/trunk/fs/nfs/nfs4super.c @@ -51,6 +51,7 @@ static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs4_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs4_evict_inode, diff --git a/trunk/fs/nfs/nfs4xdr.c b/trunk/fs/nfs/nfs4xdr.c index 40836ee5dc3a..26b143920433 100644 --- a/trunk/fs/nfs/nfs4xdr.c +++ b/trunk/fs/nfs/nfs4xdr.c @@ -56,6 +56,7 @@ #include "nfs4_fs.h" #include "internal.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -270,6 +271,8 @@ static int nfs4_stat_to_errno(int); #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) +#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \ + sizeof(utsname()->version) + sizeof(utsname()->machine) + 8) #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ @@ -282,7 +285,7 @@ static int nfs4_stat_to_errno(int); 1 /* nii_domain */ + \ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* nii_name */ + \ - XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + XDR_QUADLEN(IMPL_NAME_LIMIT) + \ 3 /* nii_date */) #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 2 /* eir_clientid */ + \ @@ -936,7 +939,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, * but this is not required as a MUST for the server to do so. 
*/ hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; - BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); + WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); p = reserve_space(xdr, 8); *p++ = cpu_to_be32(hdr->minorversion); @@ -955,7 +958,7 @@ static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, static void encode_nops(struct compound_hdr *hdr) { - BUG_ON(hdr->nops > NFS4_MAX_OPS); + WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS); *hdr->nops_p = htonl(hdr->nops); } @@ -1403,7 +1406,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: - BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } @@ -1621,7 +1623,6 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); - BUG_ON(arg->acl_len % 4); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); @@ -1713,7 +1714,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - char impl_name[NFS4_OPAQUE_LIMIT]; + char impl_name[IMPL_NAME_LIMIT]; int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); @@ -1728,7 +1729,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, if (send_implementation_id && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - <= NFS4_OPAQUE_LIMIT + 1) + <= sizeof(impl_name) + 1) len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", utsname()->sysname, utsname()->release, utsname()->version, utsname()->machine); @@ -1835,18 +1836,16 @@ static void encode_sequence(struct xdr_stream *xdr, struct compound_hdr *hdr) { #if defined(CONFIG_NFS_V4_1) - struct nfs4_session *session = args->sa_session; + struct nfs4_session *session; struct nfs4_slot_table *tp; - struct nfs4_slot *slot; + struct nfs4_slot *slot = args->sa_slot; __be32 *p; - if (!session) + if (slot == NULL) return; - tp = &session->fc_slot_table; - - WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); - slot = tp->slots + args->sa_slotid; + tp = slot->table; + session = tp->session; encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); @@ -1860,12 +1859,12 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[1], ((u32 *)session->sess_id.data)[2], ((u32 *)session->sess_id.data)[3], - slot->seq_nr, args->sa_slotid, + slot->seq_nr, slot->slot_nr, tp->highest_used_slotid, args->sa_cache_this); p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(slot->seq_nr); - *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(slot->slot_nr); *p++ = cpu_to_be32(tp->highest_used_slotid); *p = cpu_to_be32(args->sa_cache_this); #endif /* CONFIG_NFS_V4_1 */ @@ -2027,8 +2026,9 @@ static void encode_free_stateid(struct xdr_stream *xdr, static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) { #if defined(CONFIG_NFS_V4_1) - if (args->sa_session) - return args->sa_session->clp->cl_mvops->minor_version; + + if (args->sa_slot) + return args->sa_slot->table->session->clp->cl_mvops->minor_version; #endif /* CONFIG_NFS_V4_1 */ return 0; } @@ -5509,12 +5509,13 @@ static int decode_sequence(struct xdr_stream *xdr, struct rpc_rqst *rqstp) { 
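	/*
	 * Editor's note, not part of the original patch: with the slot-table
	 * rewrite the SEQUENCE reply decoded below is checked against the slot
	 * that was actually used (res->sr_slot) rather than against a cached
	 * sa_session pointer -- the decoded session id must match the slot
	 * table's session and the decoded slot id must equal
	 * res->sr_slot->slot_nr, while the server's highest and target highest
	 * slot ids are now stored in sr_highest_slotid and
	 * sr_target_highest_slotid instead of being discarded.
	 */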
#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; struct nfs4_sessionid id; u32 dummy; int status; __be32 *p; - if (!res->sr_session) + if (res->sr_slot == NULL) return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); @@ -5528,8 +5529,9 @@ static int decode_sequence(struct xdr_stream *xdr, * sequence number, the server is looney tunes. */ status = -EREMOTEIO; + session = res->sr_slot->table->session; - if (memcmp(id.data, res->sr_session->sess_id.data, + if (memcmp(id.data, session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; @@ -5547,14 +5549,14 @@ static int decode_sequence(struct xdr_stream *xdr, } /* slot id */ dummy = be32_to_cpup(p++); - if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { + if (dummy != res->sr_slot->slot_nr) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } - /* highest slot id - currently not processed */ - dummy = be32_to_cpup(p++); - /* target highest slot id - currently not processed */ - dummy = be32_to_cpup(p++); + /* highest slot id */ + res->sr_highest_slotid = be32_to_cpup(p++); + /* target highest slot id */ + res->sr_target_highest_slotid = be32_to_cpup(p++); /* result flags */ res->sr_status_flags = be32_to_cpup(p); status = 0; diff --git a/trunk/fs/nfs/objlayout/objlayout.c b/trunk/fs/nfs/objlayout/objlayout.c index 874613545301..a9ebd817278b 100644 --- a/trunk/fs/nfs/objlayout/objlayout.c +++ b/trunk/fs/nfs/objlayout/objlayout.c @@ -148,17 +148,6 @@ end_offset(u64 start, u64 len) return end >= start ? end : NFS4_MAX_UINT64; } -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1 : NFS4_MAX_UINT64; -} - static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, struct page ***p_pages, unsigned *p_pgbase, u64 offset, unsigned long count) diff --git a/trunk/fs/nfs/pnfs.c b/trunk/fs/nfs/pnfs.c index 2878f97bd78d..e7165d915362 100644 --- a/trunk/fs/nfs/pnfs.c +++ b/trunk/fs/nfs/pnfs.c @@ -369,17 +369,6 @@ end_offset(u64 start, u64 len) return end >= start ? end : NFS4_MAX_UINT64; } -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1 : NFS4_MAX_UINT64; -} - /* * is l2 fully contained in l1? * start1 end1 @@ -645,7 +634,6 @@ send_layoutget(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), gfp_flags); if (lgp == NULL) return NULL; @@ -1126,7 +1114,6 @@ pnfs_update_layout(struct inode *ino, * chance of a CB_LAYOUTRECALL(FILE) coming in. 
*/ spin_lock(&clp->cl_lock); - BUG_ON(!list_empty(&lo->plh_layouts)); list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -1222,7 +1209,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r { u64 rd_size = req->wb_bytes; - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_read_mds(pgio); @@ -1251,7 +1238,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_write_mds(pgio); diff --git a/trunk/fs/nfs/proc.c b/trunk/fs/nfs/proc.c index 50a88c3546ed..f084dac948e1 100644 --- a/trunk/fs/nfs/proc.c +++ b/trunk/fs/nfs/proc.c @@ -46,39 +46,6 @@ #define NFSDBG_FACILITY NFSDBG_PROC -/* - * wrapper to handle the -EKEYEXPIRED error message. This should generally - * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't - * support the NFSERR_JUKEBOX error code, but we handle this situation in the - * same way that we handle that error with NFSv3. - */ -static int -nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) -{ - int res; - do { - res = rpc_call_sync(clnt, msg, flags); - if (res != -EKEYEXPIRED) - break; - freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); - res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); - return res; -} - -#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) - -static int -nfs_async_handle_expired_key(struct rpc_task *task) -{ - if (task->tk_status != -EKEYEXPIRED) - return 0; - task->tk_status = 0; - rpc_restart_call(task); - rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); - return 1; -} - /* * Bare-bones access to getattr: this is for nfs_read_super. 
*/ @@ -364,8 +331,6 @@ static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlink static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { - if (nfs_async_handle_expired_key(task)) - return 0; nfs_mark_for_revalidate(dir); return 1; } @@ -385,8 +350,6 @@ static int nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, struct inode *new_dir) { - if (nfs_async_handle_expired_key(task)) - return 0; nfs_mark_for_revalidate(old_dir); nfs_mark_for_revalidate(new_dir); return 1; @@ -642,9 +605,6 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { struct inode *inode = data->header->inode; - if (nfs_async_handle_expired_key(task)) - return -EAGAIN; - nfs_invalidate_atime(inode); if (task->tk_status >= 0) { nfs_refresh_inode(inode, data->res.fattr); @@ -671,9 +631,6 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->header->inode; - if (nfs_async_handle_expired_key(task)) - return -EAGAIN; - if (task->tk_status >= 0) nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c index 652d3f7176a9..aa5315bb3666 100644 --- a/trunk/fs/nfs/super.c +++ b/trunk/fs/nfs/super.c @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "nfs4session.h" #include "pnfs.h" #include "nfs.h" @@ -307,6 +308,7 @@ const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs_evict_inode, diff --git a/trunk/fs/nfs/write.c b/trunk/fs/nfs/write.c index 9347ab7c9574..5209916e1222 100644 --- a/trunk/fs/nfs/write.c +++ b/trunk/fs/nfs/write.c @@ -202,7 +202,6 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c /* A writeback failed: mark the page as bad, and invalidate the page cache */ static void nfs_set_pageerror(struct page *page) { - SetPageError(page); nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } @@ -239,21 +238,18 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static int nfs_set_page_writeback(struct page *page) +static void nfs_set_page_writeback(struct page *page) { + struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); int ret = test_set_page_writeback(page); - if (!ret) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); + WARN_ON_ONCE(ret != 0); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); } - return ret; } static void nfs_end_page_writeback(struct page *page) @@ -315,10 +311,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (IS_ERR(req)) goto out; - ret = nfs_set_page_writeback(page); - BUG_ON(ret != 0); - BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + nfs_set_page_writeback(page); + WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); ret = pgio->pg_error; @@ -451,8 +447,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct inode *inode = 
req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - BUG_ON (!NFS_WBACK_BUSY(req)); - spin_lock(&inode->i_lock); if (likely(!PageSwapCache(req->wb_page))) { set_page_private(req->wb_page, 0); @@ -884,7 +878,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { if (nfs_have_delegated_attributes(inode)) goto out; - if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) return false; out: return PageUptodate(page) != 0; @@ -1727,7 +1721,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - BUG_ON(!PageLocked(page)); for (;;) { wait_on_page_writeback(page); req = nfs_page_find_request(page); @@ -1829,7 +1822,7 @@ int __init nfs_init_writepagecache(void) goto out_destroy_write_mempool; nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, - nfs_wdata_cachep); + nfs_cdata_cachep); if (nfs_commit_mempool == NULL) goto out_destroy_commit_cache; diff --git a/trunk/fs/notify/Makefile b/trunk/fs/notify/Makefile index ae5f33a6d868..96d3420d0242 100644 --- a/trunk/fs/notify/Makefile +++ b/trunk/fs/notify/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o + mark.o vfsmount_mark.o fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/trunk/fs/notify/fanotify/fanotify_user.c b/trunk/fs/notify/fanotify/fanotify_user.c index 6fcaeb8c902e..a5cd9bba022f 100644 --- a/trunk/fs/notify/fanotify/fanotify_user.c +++ b/trunk/fs/notify/fanotify/fanotify_user.c @@ -17,6 +17,7 @@ #include #include "../../mount.h" +#include "../fdinfo.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -428,6 +429,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar } static const struct file_operations fanotify_fops = { + .show_fdinfo = fanotify_show_fdinfo, .poll = fanotify_poll, .read = fanotify_read, .write = fanotify_write, diff --git a/trunk/fs/notify/fdinfo.c b/trunk/fs/notify/fdinfo.c new file mode 100644 index 000000000000..514c4b81483d --- /dev/null +++ b/trunk/fs/notify/fdinfo.c @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "inotify/inotify.h" +#include "../fs/mount.h" + +#if defined(CONFIG_PROC_FS) + +#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY) + +static int show_fdinfo(struct seq_file *m, struct file *f, + int (*show)(struct seq_file *m, struct fsnotify_mark *mark)) +{ + struct fsnotify_group *group = f->private_data; + struct fsnotify_mark *mark; + int ret = 0; + + spin_lock(&group->mark_lock); + list_for_each_entry(mark, &group->marks_list, g_list) { + ret = show(m, mark); + if (ret) + break; + } + spin_unlock(&group->mark_lock); + return ret; +} + +#if defined(CONFIG_EXPORTFS) +static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + struct { + struct file_handle handle; + u8 pad[64]; + } f; + int size, ret, i; + + f.handle.handle_bytes = sizeof(f.pad); + size = f.handle.handle_bytes >> 2; + + ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); + if ((ret == 255) || (ret == -ENOSPC)) { + WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + return 0; + } + + f.handle.handle_type = ret; + f.handle.handle_bytes = size * sizeof(u32); + + ret = seq_printf(m, "fhandle-bytes:%x 
fhandle-type:%x f_handle:", + f.handle.handle_bytes, f.handle.handle_type); + + for (i = 0; i < f.handle.handle_bytes; i++) + ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]); + + return ret; +} +#else +static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + return 0; +} +#endif + +#ifdef CONFIG_INOTIFY_USER + +static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + struct inotify_inode_mark *inode_mark; + struct inode *inode; + int ret = 0; + + if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + return 0; + + inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); + inode = igrab(mark->i.inode); + if (inode) { + ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x " + "mask:%x ignored_mask:%x ", + inode_mark->wd, inode->i_ino, + inode->i_sb->s_dev, + mark->mask, mark->ignored_mask); + ret |= show_mark_fhandle(m, inode); + ret |= seq_putc(m, '\n'); + iput(inode); + } + + return ret; +} + +int inotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + return show_fdinfo(m, f, inotify_fdinfo); +} + +#endif /* CONFIG_INOTIFY_USER */ + +#ifdef CONFIG_FANOTIFY + +static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + unsigned int mflags = 0; + struct inode *inode; + int ret = 0; + + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) + return 0; + + if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + mflags |= FAN_MARK_IGNORED_SURV_MODIFY; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = igrab(mark->i.inode); + if (!inode) + goto out; + ret = seq_printf(m, "fanotify ino:%lx sdev:%x " + "mflags:%x mask:%x ignored_mask:%x ", + inode->i_ino, inode->i_sb->s_dev, + mflags, mark->mask, mark->ignored_mask); + ret |= show_mark_fhandle(m, inode); + ret |= seq_putc(m, '\n'); + iput(inode); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { + struct mount *mnt = real_mount(mark->m.mnt); + + ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x " + "ignored_mask:%x\n", mnt->mnt_id, mflags, + mark->mask, mark->ignored_mask); + } +out: + return ret; +} + +int fanotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct fsnotify_group *group = f->private_data; + unsigned int flags = 0; + + switch (group->priority) { + case FS_PRIO_0: + flags |= FAN_CLASS_NOTIF; + break; + case FS_PRIO_1: + flags |= FAN_CLASS_CONTENT; + break; + case FS_PRIO_2: + flags |= FAN_CLASS_PRE_CONTENT; + break; + } + + if (group->max_events == UINT_MAX) + flags |= FAN_UNLIMITED_QUEUE; + + if (group->fanotify_data.max_marks == UINT_MAX) + flags |= FAN_UNLIMITED_MARKS; + + seq_printf(m, "fanotify flags:%x event-flags:%x\n", + flags, group->fanotify_data.f_flags); + + return show_fdinfo(m, f, fanotify_fdinfo); +} + +#endif /* CONFIG_FANOTIFY */ + +#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */ + +#endif /* CONFIG_PROC_FS */ diff --git a/trunk/fs/notify/fdinfo.h b/trunk/fs/notify/fdinfo.h new file mode 100644 index 000000000000..556afda990e9 --- /dev/null +++ b/trunk/fs/notify/fdinfo.h @@ -0,0 +1,27 @@ +#ifndef __FSNOTIFY_FDINFO_H__ +#define __FSNOTIFY_FDINFO_H__ + +#include +#include + +struct seq_file; +struct file; + +#ifdef CONFIG_PROC_FS + +#ifdef CONFIG_INOTIFY_USER +extern int inotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#ifdef CONFIG_FANOTIFY +extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#else /* CONFIG_PROC_FS */ + +#define inotify_show_fdinfo NULL +#define fanotify_show_fdinfo NULL + +#endif /* CONFIG_PROC_FS */ + +#endif 
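/*
 * Editor's note -- illustrative only, not part of the original patch.
 * Putting the pieces together (the pos/flags header printed by the
 * fs/proc/fd.c hook later in this series plus the per-mark lines from
 * inotify_fdinfo() and show_mark_fhandle() above), reading
 * /proc/<pid>/fdinfo/<fd> for an inotify descriptor should look roughly
 * like the lines below; every numeric value here is made up for the
 * example:
 *
 *	pos:	0
 *	flags:	00
 *	inotify wd:1 ino:1a10f sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:0fa1010000000000
 */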
/* __FSNOTIFY_FDINFO_H__ */ diff --git a/trunk/fs/notify/inode_mark.c b/trunk/fs/notify/inode_mark.c index b13c00ac48eb..f3035691f528 100644 --- a/trunk/fs/notify/inode_mark.c +++ b/trunk/fs/notify/inode_mark.c @@ -116,8 +116,9 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) * given a group and inode, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL */ -struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, - struct inode *inode) +static struct fsnotify_mark *fsnotify_find_inode_mark_locked( + struct fsnotify_group *group, + struct inode *inode) { struct fsnotify_mark *mark; struct hlist_node *pos; diff --git a/trunk/fs/notify/inotify/inotify_user.c b/trunk/fs/notify/inotify/inotify_user.c index c311dda054a3..36cb013c7c13 100644 --- a/trunk/fs/notify/inotify/inotify_user.c +++ b/trunk/fs/notify/inotify/inotify_user.c @@ -40,6 +40,7 @@ #include #include "inotify.h" +#include "../fdinfo.h" #include @@ -335,6 +336,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { + .show_fdinfo = inotify_show_fdinfo, .poll = inotify_poll, .read = inotify_read, .fasync = inotify_fasync, diff --git a/trunk/fs/ocfs2/extent_map.c b/trunk/fs/ocfs2/extent_map.c index 70b5863a2d64..f487aa343442 100644 --- a/trunk/fs/ocfs2/extent_map.c +++ b/trunk/fs/ocfs2/extent_map.c @@ -832,7 +832,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; @@ -843,7 +843,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); + BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE); ret = ocfs2_inode_lock(inode, &di_bh, 0); if (ret) { @@ -859,7 +859,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (origin == SEEK_HOLE) + if (whence == SEEK_HOLE) *offset = inode->i_size; goto out_unlock; } @@ -888,8 +888,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 
0 : 1; } - if ((!is_data && origin == SEEK_HOLE) || - (is_data && origin == SEEK_DATA)) { + if ((!is_data && whence == SEEK_HOLE) || + (is_data && whence == SEEK_DATA)) { if (extoff > *offset) *offset = extoff; goto out_unlock; @@ -899,7 +899,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) cpos += clen; } - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { extoff = cpos; extoff <<= cs_bits; extlen = clen; diff --git a/trunk/fs/ocfs2/file.c b/trunk/fs/ocfs2/file.c index dda089804942..fe492e1a3cfc 100644 --- a/trunk/fs/ocfs2/file.c +++ b/trunk/fs/ocfs2/file.c @@ -2637,14 +2637,14 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } /* Refer generic_file_llseek_unlocked() */ -static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret = 0; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_SET: break; case SEEK_END: @@ -2659,7 +2659,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) break; case SEEK_DATA: case SEEK_HOLE: - ret = ocfs2_seek_data_hole_offset(file, &offset, origin); + ret = ocfs2_seek_data_hole_offset(file, &offset, whence); if (ret) goto out; break; diff --git a/trunk/fs/open.c b/trunk/fs/open.c index 59071f55bf7f..182d8667b7bd 100644 --- a/trunk/fs/open.c +++ b/trunk/fs/open.c @@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) goto dput_and_out; error = -EPERM; - if (!capable(CAP_SYS_CHROOT)) + if (!nsown_capable(CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) diff --git a/trunk/fs/pnode.h b/trunk/fs/pnode.h index 65c60979d541..19b853a3445c 100644 --- a/trunk/fs/pnode.h +++ b/trunk/fs/pnode.h @@ -22,6 +22,7 @@ #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 +#define CL_SHARED_TO_SLAVE 0x20 static inline void set_mnt_shared(struct mount *mnt) { diff --git a/trunk/fs/proc/Makefile b/trunk/fs/proc/Makefile index 99349efbbc2b..981b05601931 100644 --- a/trunk/fs/proc/Makefile +++ b/trunk/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += uptime.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o +proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index d3696708fc1a..6a91e6ffbcbd 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { - struct user_namespace *user_ns = current_user_ns(); + struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; task_unlock(p); - for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); @@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_putc(m, '\n'); } -static void render_sigset_t(struct seq_file *m, const char *header, +void render_sigset_t(struct seq_file *m, const char *header, sigset_t 
*set) { int i; @@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } +/* Remove non-existent capabilities */ +#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ + CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) + static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -321,12 +325,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); + NORM_CAPS(cap_inheritable); + NORM_CAPS(cap_permitted); + NORM_CAPS(cap_effective); + NORM_CAPS(cap_bset); + render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); } +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); +#endif +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -360,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index aa63d25157b8..5a5a0be40e40 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -2345,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = { }; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; -} - -static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = proc_self_put_link, -}; - -/* - * proc base - * - * These are the directory entries in the root directory of /proc - * that properly belong to the /proc filesystem, as they describe - * describe something that is process related. 
- */ -static const struct pid_entry proc_base_stuff[] = { - NOD("self", S_IFLNK|S_IRWXUGO, - &proc_self_inode_operations, NULL, {}), -}; - -static struct dentry *proc_base_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct pid_entry *p = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error; - - /* Allocate the inode */ - error = ERR_PTR(-ENOMEM); - inode = new_inode(dir->i_sb); - if (!inode) - goto out; - - /* Initialize the inode */ - ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - /* - * grab the reference to the task. - */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) - goto out_iput; - - inode->i_mode = p->mode; - if (S_ISDIR(inode->i_mode)) - set_nlink(inode, 2); - if (S_ISLNK(inode->i_mode)) - inode->i_size = 64; - if (p->iop) - inode->i_op = p->iop; - if (p->fop) - inode->i_fop = p->fop; - ei->op = p->op; - d_add(dentry, inode); - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - /* Lookup the directory entry */ - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; - for (p = proc_base_stuff; p <= last; p++) { - if (p->len != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) - break; - } - if (p > last) - goto out; - - error = proc_base_instantiate(dir, dentry, task, p); - -out: - put_task_struct(task); -out_no_task: - return error; -} - -static int proc_base_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_base_instantiate, task, p); -} - #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { @@ -2839,10 +2699,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, @@ -2876,15 +2732,11 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result; + struct dentry *result = NULL; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - result = proc_base_lookup(dir, dentry); - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) - goto out; - tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2947,7 +2799,7 @@ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter ite return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2967,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr; - struct task_struct *reaper; struct 
tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out_no_task; - nr = filp->f_pos - FIRST_PROCESS_ENTRY; - - reaper = get_proc_task(filp->f_path.dentry->d_inode); - if (!reaper) - goto out_no_task; - - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { - const struct pid_entry *p = &proc_base_stuff[nr]; - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) - goto out; - } + goto out; ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ -3006,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: - put_task_struct(reaper); -out_no_task: return 0; } diff --git a/trunk/fs/proc/fd.c b/trunk/fs/proc/fd.c index f28a875f8779..d7a4a28ef630 100644 --- a/trunk/fs/proc/fd.c +++ b/trunk/fs/proc/fd.c @@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v) if (!ret) { seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", (long long)file->f_pos, f_flags); + if (file->f_op->show_fdinfo) + ret = file->f_op->show_fdinfo(m, file); fput(file); } diff --git a/trunk/fs/proc/generic.c b/trunk/fs/proc/generic.c index 0d80cef4cfb9..7b3ae3cc0ef9 100644 --- a/trunk/fs/proc/generic.c +++ b/trunk/fs/proc/generic.c @@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ -static unsigned int get_inode_number(void) +int proc_alloc_inum(unsigned int *inum) { unsigned int i; int error; retry: - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) - return 0; + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; spin_lock(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); @@ -365,18 +365,19 @@ static unsigned int get_inode_number(void) if (error == -EAGAIN) goto retry; else if (error) - return 0; + return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); - return 0; + return -ENOSPC; } - return PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + i; + return 0; } -static void release_inode_number(unsigned int inum) +void proc_free_inum(unsigned int inum) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); @@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - unsigned int i; struct proc_dir_entry *tmp; + int ret; - i = get_inode_number(); - if (i == 0) - return -EAGAIN; - dp->low_ino = i; + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { @@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - release_inode_number(de->low_ino); + proc_free_inum(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); diff --git a/trunk/fs/proc/inode.c b/trunk/fs/proc/inode.c index 3b22bbdee9ec..439ae6886507 100644 --- a/trunk/fs/proc/inode.c +++ b/trunk/fs/proc/inode.c @@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode) struct proc_dir_entry *de; struct ctl_table_header *head; const struct proc_ns_operations *ns_ops; + void *ns; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode) } /* Release any associated namespace */ ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && 
ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); + ns = PROC_I(inode)->ns; + if (ns_ops && ns) + ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 43973b084abf..252544c05207 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -15,6 +15,7 @@ struct ctl_table_header; struct mempolicy; extern struct proc_dir_entry proc_root; +extern void proc_self_init(void); #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void sysctl_head_put(struct ctl_table_header *head); diff --git a/trunk/fs/proc/namespaces.c b/trunk/fs/proc/namespaces.c index b178ed733c36..b7a47196c8c3 100644 --- a/trunk/fs/proc/namespaces.c +++ b/trunk/fs/proc/namespaces.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_IPC_NS &ipcns_operations, #endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, +#endif + &mntns_operations, }; static const struct file_operations ns_file_operations = { .llseek = no_llseek, }; +static const struct inode_operations ns_inode_operations = { + .setattr = proc_setattr, +}; + +static int ns_delete_dentry(const struct dentry *dentry) +{ + /* Don't cache namespace inodes when not in use */ + return 1; +} + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_delete = ns_delete_dentry, + .d_dname = ns_dname, +}; + +static struct dentry *proc_ns_get_dentry(struct super_block *sb, + struct task_struct *task, const struct proc_ns_operations *ns_ops) +{ + struct dentry *dentry, *result; + struct inode *inode; + struct proc_inode *ei; + struct qstr qname = { .name = "", }; + void *ns; + + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + dentry = d_alloc_pseudo(sb, &qname); + if (!dentry) { + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + inode = iget_locked(sb, ns_ops->inum(ns)); + if (!inode) { + dput(dentry); + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + ei = PROC_I(inode); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } + + d_set_d_op(dentry, &ns_dentry_operations); + result = d_instantiate_unique(dentry, inode); + if (result) { + dput(dentry); + dentry = result; + } + + return dentry; +} + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct proc_inode *ei = PROC_I(inode); + struct task_struct *task; + struct dentry *ns_dentry; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_dentry)) { + error = ERR_CAST(ns_dentry); + goto out_put_task; + } + + dput(nd->path.dentry); + nd->path.dentry = ns_dentry; + error = NULL; + +out_put_task: + 
put_task_struct(task); +out: + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = ei->ns_ops; + struct task_struct *task; + void *ns; + char name[50]; + int len = -EACCES; + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + len = -ENOENT; + ns = ns_ops->get(task); + if (!ns) + goto out_put_task; + + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); + len = strlen(name); + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, name, len)) + len = -EFAULT; + + ns_ops->put(ns); +out_put_task: + put_task_struct(task); +out: + return len; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + static struct dentry *proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); - void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; - ns = ns_ops->get(task); - if (!ns) - goto out_iput; - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, error = NULL; out: return error; -out_iput: - iput(inode); - goto out; } static int proc_ns_fill_cache(struct file *filp, void *dirent, @@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent, if (!task) goto out_no_task; - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - ret = 0; i = filp->f_pos; switch (i) { @@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!task) goto out_no_task; - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) @@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } - error = ERR_PTR(-ENOENT); if (entry == last) goto out; @@ -198,3 +337,7 @@ struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } +bool proc_ns_inode(struct inode *inode) +{ + return inode->i_fop == &ns_file_operations; +} diff --git a/trunk/fs/proc/proc_devtree.c b/trunk/fs/proc/proc_devtree.c index df7dd08d4391..de20ec480fa0 100644 --- a/trunk/fs/proc/proc_devtree.c +++ b/trunk/fs/proc/proc_devtree.c @@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np, set_node_proc_entry(np, de); for (child = NULL; (child = of_get_next_child(np, child));) { /* Use everything after the last slash, or the full name */ - p = strrchr(child->full_name, '/'); - if (!p) - p = child->full_name; - else - ++p; + p = kbasename(child->full_name); if (duplicate_name(de, p)) p = fixup_name(np, de, p); diff --git a/trunk/fs/proc/root.c b/trunk/fs/proc/root.c index 
9889a92d2e01..c6e9fac26bac 100644 --- a/trunk/fs/proc/root.c +++ b/trunk/fs/proc/root.c @@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } @@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - return dget(sb->s_root); } @@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -163,12 +156,8 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/trunk/fs/proc/self.c b/trunk/fs/proc/self.c new file mode 100644 index 000000000000..aa5cc3bff140 --- /dev/null +++ b/trunk/fs/proc/self.c @@ -0,0 +1,59 @@ +#include +#include +#include + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +void __init proc_self_init(void) +{ + struct proc_dir_entry *proc_self_symlink; + mode_t mode; + + mode = S_IFLNK | S_IRWXUGO; + proc_self_symlink = proc_create("self", mode, NULL, NULL ); + proc_self_symlink->proc_iops = &proc_self_inode_operations; +} diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c index 48775628abbf..448455b7fd91 100644 --- a/trunk/fs/proc/task_mmu.c +++ b/trunk/fs/proc/task_mmu.c @@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... 
(BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_NONLINEAR)] = "nl", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) seq_printf(m, "Nonlinear: %8lu kB\n", mss.nonlinear >> 10); + show_smap_vma_flags(m, vma); + if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) ? vma->vm_start : 0; diff --git a/trunk/fs/pstore/inode.c b/trunk/fs/pstore/inode.c index ed1d8c7212da..67de74ca85f4 100644 --- a/trunk/fs/pstore/inode.c +++ b/trunk/fs/pstore/inode.c @@ -151,13 +151,13 @@ static int pstore_file_open(struct inode *inode, struct file *file) return 0; } -static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) +static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) { struct seq_file *sf = file->private_data; if (sf->op) - return seq_lseek(file, off, origin); - return default_llseek(file, off, origin); + return seq_lseek(file, off, whence); + return default_llseek(file, off, whence); } static const struct file_operations pstore_file_operations = { diff --git a/trunk/fs/read_write.c b/trunk/fs/read_write.c index d06534857e9e..1edaf099ddd7 100644 --- a/trunk/fs/read_write.c +++ b/trunk/fs/read_write.c @@ -54,7 +54,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * @size: max size of this file in file system * @eof: offset used for SEEK_END position * @@ -67,12 +67,12 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * read/writes behave like SEEK_SET against seeks. */ loff_t -generic_file_llseek_size(struct file *file, loff_t offset, int origin, +generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { struct inode *inode = file->f_mapping->host; - switch (origin) { + switch (whence) { case SEEK_END: offset += eof; break; @@ -122,17 +122,17 @@ EXPORT_SYMBOL(generic_file_llseek_size); * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. 
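As an aside on the origin -> whence rename running through fs/read_write.c: the three classic whence values resolve exactly as in the sketch below, a user-space rendition of the switch generic_file_llseek_size() performs (the kernel version additionally clamps against maxsize and handles further cases); resolve_offset() and the sample numbers are made up for illustration only.

#include <stdio.h>
#include <unistd.h>	/* SEEK_SET, SEEK_CUR, SEEK_END */

/* Illustrative only: how a whence value turns an offset argument into an
 * absolute file position, given the current position and the file size. */
static long long resolve_offset(long long offset, int whence,
				long long pos, long long size)
{
	switch (whence) {
	case SEEK_SET:
		return offset;		/* absolute */
	case SEEK_CUR:
		return pos + offset;	/* relative to current position */
	case SEEK_END:
		return size + offset;	/* relative to end of file */
	default:
		return -1;		/* invalid whence */
	}
}

int main(void)
{
	/* a 1000-byte file with the position currently at 100 */
	printf("%lld\n", resolve_offset(50, SEEK_SET, 100, 1000));	/* 50 */
	printf("%lld\n", resolve_offset(50, SEEK_CUR, 100, 1000));	/* 150 */
	printf("%lld\n", resolve_offset(-50, SEEK_END, 100, 1000));	/* 950 */
	return 0;
}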
It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. + * @offset and @whence under i_mutex. */ -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } @@ -142,32 +142,32 @@ EXPORT_SYMBOL(generic_file_llseek); * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ -loff_t noop_llseek(struct file *file, loff_t offset, int origin) +loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int origin) +loff_t no_llseek(struct file *file, loff_t offset, int whence) { return -ESPIPE; } EXPORT_SYMBOL(no_llseek); -loff_t default_llseek(struct file *file, loff_t offset, int origin) +loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_path.dentry->d_inode; loff_t retval; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_END: offset += i_size_read(inode); break; @@ -216,7 +216,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(default_llseek); -loff_t vfs_llseek(struct file *file, loff_t offset, int origin) +loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { loff_t (*fn)(struct file *, loff_t, int); @@ -225,11 +225,11 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) if (file->f_op && file->f_op->llseek) fn = file->f_op->llseek; } - return fn(file, offset, origin); + return fn(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); -SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; struct fd f = fdget(fd); @@ -237,8 +237,8 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) return -EBADF; retval = -EINVAL; - if (origin <= SEEK_MAX) { - loff_t res = vfs_llseek(f.file, offset, origin); + if (whence <= SEEK_MAX) { + loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ @@ -250,7 +250,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) #ifdef __ARCH_WANT_SYS_LLSEEK SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, - unsigned int, origin) + unsigned int, whence) { int retval; struct fd f = fdget(fd); @@ -260,11 +260,11 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, return -EBADF; retval = -EINVAL; - if (origin > SEEK_MAX) + if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, - origin); + whence); retval = (int)offset; if (offset >= 0) { diff --git a/trunk/fs/seq_file.c 
b/trunk/fs/seq_file.c index 99dffab4c4e4..9d863fb501f9 100644 --- a/trunk/fs/seq_file.c +++ b/trunk/fs/seq_file.c @@ -300,14 +300,14 @@ EXPORT_SYMBOL(seq_read); * * Ready-made ->f_op->llseek() */ -loff_t seq_lseek(struct file *file, loff_t offset, int origin) +loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); m->version = file->f_version; - switch (origin) { + switch (whence) { case 1: offset += file->f_pos; case 0: diff --git a/trunk/fs/signalfd.c b/trunk/fs/signalfd.c index 8bee4e570911..b53486961735 100644 --- a/trunk/fs/signalfd.c +++ b/trunk/fs/signalfd.c @@ -29,6 +29,7 @@ #include #include #include +#include void signalfd_cleanup(struct sighand_struct *sighand) { @@ -227,7 +228,24 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, return total ? total: ret; } +#ifdef CONFIG_PROC_FS +static int signalfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct signalfd_ctx *ctx = f->private_data; + sigset_t sigmask; + + sigmask = ctx->sigmask; + signotset(&sigmask); + render_sigset_t(m, "sigmask:\t", &sigmask); + + return 0; +} +#endif + static const struct file_operations signalfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = signalfd_show_fdinfo, +#endif .release = signalfd_release, .poll = signalfd_poll, .read = signalfd_read, diff --git a/trunk/fs/sysfs/mount.c b/trunk/fs/sysfs/mount.c index 71eb7e253927..db940a9be045 100644 --- a/trunk/fs/sysfs/mount.c +++ b/trunk/fs/sysfs/mount.c @@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/trunk/fs/ubifs/debug.c b/trunk/fs/ubifs/debug.c index 62911637e12f..12817ffc7345 100644 --- a/trunk/fs/ubifs/debug.c +++ b/trunk/fs/ubifs/debug.c @@ -2560,7 +2560,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int len) { - unsigned int from, to, i, ffs = chance(1, 2); + unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; from = random32() % (len + 1); @@ -2571,11 +2571,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, ffs ? 
"0xFFs" : "random data"); if (ffs) - for (i = from; i < to; i++) - p[i] = 0xFF; + memset(p + from, 0xFF, to - from); else - for (i = from; i < to; i++) - p[i] = random32() % 0x100; + prandom_bytes(p + from, to - from); return to; } diff --git a/trunk/fs/ubifs/dir.c b/trunk/fs/ubifs/dir.c index e271fba1651b..8a574776a493 100644 --- a/trunk/fs/ubifs/dir.c +++ b/trunk/fs/ubifs/dir.c @@ -453,11 +453,11 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } /* If a directory is seeked, we have to free saved readdir() state */ -static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) { kfree(file->private_data); file->private_data = NULL; - return generic_file_llseek(file, offset, origin); + return generic_file_llseek(file, offset, whence); } /* Free saved readdir() state when the directory is closed */ diff --git a/trunk/include/asm-generic/io.h b/trunk/include/asm-generic/io.h index 9e0ebe051243..d1e93284d72a 100644 --- a/trunk/include/asm-generic/io.h +++ b/trunk/include/asm-generic/io.h @@ -154,7 +154,7 @@ static inline void insb(unsigned long addr, void *buffer, int count) if (count) { u8 *buf = buffer; do { - u8 x = inb(addr); + u8 x = __raw_readb(addr + PCI_IOBASE); *buf++ = x; } while (--count); } @@ -167,7 +167,7 @@ static inline void insw(unsigned long addr, void *buffer, int count) if (count) { u16 *buf = buffer; do { - u16 x = inw(addr); + u16 x = __raw_readw(addr + PCI_IOBASE); *buf++ = x; } while (--count); } @@ -180,7 +180,7 @@ static inline void insl(unsigned long addr, void *buffer, int count) if (count) { u32 *buf = buffer; do { - u32 x = inl(addr); + u32 x = __raw_readl(addr + PCI_IOBASE); *buf++ = x; } while (--count); } @@ -193,7 +193,7 @@ static inline void outsb(unsigned long addr, const void *buffer, int count) if (count) { const u8 *buf = buffer; do { - outb(*buf++, addr); + __raw_writeb(*buf++, addr + PCI_IOBASE); } while (--count); } } @@ -205,7 +205,7 @@ static inline void outsw(unsigned long addr, const void *buffer, int count) if (count) { const u16 *buf = buffer; do { - outw(*buf++, addr); + __raw_writew(*buf++, addr + PCI_IOBASE); } while (--count); } } @@ -217,7 +217,7 @@ static inline void outsl(unsigned long addr, const void *buffer, int count) if (count) { const u32 *buf = buffer; do { - outl(*buf++, addr); + __raw_writel(*buf++, addr + PCI_IOBASE); } while (--count); } } diff --git a/trunk/include/linux/asn1.h b/trunk/include/linux/asn1.h index 5c3f4e4b9a23..eed6982860ba 100644 --- a/trunk/include/linux/asn1.h +++ b/trunk/include/linux/asn1.h @@ -64,4 +64,6 @@ enum asn1_tag { ASN1_LONG_TAG = 31 /* Long form tag */ }; +#define ASN1_INDEFINITE_LENGTH 0x80 + #endif /* _LINUX_ASN1_H */ diff --git a/trunk/include/linux/backing-dev.h b/trunk/include/linux/backing-dev.h index 238521a19849..2a9a9abc9126 100644 --- a/trunk/include/linux/backing-dev.h +++ b/trunk/include/linux/backing-dev.h @@ -18,7 +18,6 @@ #include #include #include -#include struct page; struct device; @@ -106,9 +105,6 @@ struct backing_dev_info { struct timer_list laptop_mode_wb_timer; - cpumask_t *flusher_cpumask; /* used for writeback thread scheduling */ - struct mutex flusher_cpumask_lock; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; struct dentry *debug_stats; diff --git a/trunk/include/linux/backlight.h b/trunk/include/linux/backlight.h index 5ffc6dda4675..da9a0825e007 100644 --- a/trunk/include/linux/backlight.h +++ b/trunk/include/linux/backlight.h @@ -134,4 
+134,14 @@ struct generic_bl_info { void (*kick_battery)(void); }; +#ifdef CONFIG_OF +struct backlight_device *of_find_backlight_by_node(struct device_node *node); +#else +static inline struct backlight_device * +of_find_backlight_by_node(struct device_node *node) +{ + return NULL; +} +#endif + #endif diff --git a/trunk/include/linux/bcma/bcma.h b/trunk/include/linux/bcma/bcma.h index 93b1e091b1e9..e0ce311011c0 100644 --- a/trunk/include/linux/bcma/bcma.h +++ b/trunk/include/linux/bcma/bcma.h @@ -350,6 +350,7 @@ extern void bcma_core_set_clockmode(struct bcma_device *core, enum bcma_clkmode clkmode); extern void bcma_core_pll_ctl(struct bcma_device *core, u32 req, u32 status, bool on); +extern u32 bcma_chipco_pll_read(struct bcma_drv_cc *cc, u32 offset); #define BCMA_DMA_TRANSLATION_MASK 0xC0000000 #define BCMA_DMA_TRANSLATION_NONE 0x00000000 #define BCMA_DMA_TRANSLATION_DMA32_CMT 0x40000000 /* Client Mode Translation for 32-bit DMA */ diff --git a/trunk/include/linux/binfmts.h b/trunk/include/linux/binfmts.h index 2630c9b41a86..a4c2b565c835 100644 --- a/trunk/include/linux/binfmts.h +++ b/trunk/include/linux/binfmts.h @@ -54,8 +54,6 @@ struct linux_binprm { #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) -#define BINPRM_MAX_RECURSION 4 - /* Function parameter for binfmt->coredump */ struct coredump_params { siginfo_t *siginfo; diff --git a/trunk/include/linux/blkdev.h b/trunk/include/linux/blkdev.h index acb4f7bbbd32..f94bc83011ed 100644 --- a/trunk/include/linux/blkdev.h +++ b/trunk/include/linux/blkdev.h @@ -1188,14 +1188,25 @@ static inline int queue_discard_alignment(struct request_queue *q) static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { - sector_t alignment = sector << 9; - alignment = sector_div(alignment, lim->discard_granularity); + unsigned int alignment, granularity, offset; if (!lim->max_discard_sectors) return 0; - alignment = lim->discard_granularity + lim->discard_alignment - alignment; - return sector_div(alignment, lim->discard_granularity); + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> 9; + granularity = lim->discard_granularity >> 9; + if (!granularity) + return 0; + + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); + + /* And why do we do this modulus *again* in blkdev_issue_discard()? 
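The discard-alignment arithmetic introduced in the blkdev.h hunk above can be checked with a small user-space rendition; discard_alignment_offset() is a made-up name, everything is kept in 512-byte sectors, and the kernel function converts the result back to bytes at the end.

#include <stdio.h>

/* Given the discard granularity and the device's discard_alignment (both in
 * sectors), return how far "sector" sits from the next aligned boundary. */
static unsigned int discard_alignment_offset(unsigned long long sector,
					     unsigned int granularity,
					     unsigned int alignment)
{
	unsigned int offset;

	if (!granularity)
		return 0;

	/* offset of the partition start inside one granularity-sized chunk */
	offset = sector % granularity;

	/* distance from there to the next properly aligned boundary */
	return (granularity + alignment - offset) % granularity;
}

int main(void)
{
	/* e.g. 1 MiB granularity = 2048 sectors, no extra device alignment,
	 * a partition starting at sector 63 */
	printf("%u\n", discard_alignment_offset(63, 2048, 0));	/* prints 1985 */
	return 0;
}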
*/ + offset = (granularity + alignment - offset) % granularity; + + /* Turn it back into bytes, gaah */ + return offset << 9; } static inline int bdev_discard_alignment(struct block_device *bdev) diff --git a/trunk/include/linux/compat.h b/trunk/include/linux/compat.h index 784ebfe63c48..e4920bd58a47 100644 --- a/trunk/include/linux/compat.h +++ b/trunk/include/linux/compat.h @@ -588,6 +588,9 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); +asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval); + #else #define is_compat_task() (0) diff --git a/trunk/include/linux/compiler-gcc4.h b/trunk/include/linux/compiler-gcc4.h index 412bc6c2b023..662fd1b4c42a 100644 --- a/trunk/include/linux/compiler-gcc4.h +++ b/trunk/include/linux/compiler-gcc4.h @@ -31,6 +31,8 @@ #define __linktime_error(message) __attribute__((__error__(message))) +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + #if __GNUC_MINOR__ >= 5 /* * Mark a position in code as unreachable. This can be used to @@ -63,3 +65,13 @@ #define __compiletime_warning(message) __attribute__((warning(message))) #define __compiletime_error(message) __attribute__((error(message))) #endif + +#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP +#if __GNUC_MINOR__ >= 4 +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#endif +#if __GNUC_MINOR__ >= 8 || (defined(__powerpc__) && __GNUC_MINOR__ >= 6) +#define __HAVE_BUILTIN_BSWAP16__ +#endif +#endif diff --git a/trunk/include/linux/compiler-intel.h b/trunk/include/linux/compiler-intel.h index d8e636e5607d..973ce10c40b6 100644 --- a/trunk/include/linux/compiler-intel.h +++ b/trunk/include/linux/compiler-intel.h @@ -29,3 +29,10 @@ #endif #define uninitialized_var(x) x + +#ifndef __HAVE_BUILTIN_BSWAP16__ +/* icc has this, but it's called _bswap16 */ +#define __HAVE_BUILTIN_BSWAP16__ +#define __builtin_bswap16 _bswap16 +#endif + diff --git a/trunk/include/linux/compiler.h b/trunk/include/linux/compiler.h index f430e4162f41..dd852b73b286 100644 --- a/trunk/include/linux/compiler.h +++ b/trunk/include/linux/compiler.h @@ -10,6 +10,7 @@ # define __force __attribute__((force)) # define __nocast __attribute__((nocast)) # define __iomem __attribute__((noderef, address_space(2))) +# define __must_hold(x) __attribute__((context(x,1,1))) # define __acquires(x) __attribute__((context(x,0,1))) # define __releases(x) __attribute__((context(x,1,0))) # define __acquire(x) __context__(x,1) @@ -33,6 +34,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 # define __builtin_warning(x, y...) (1) +# define __must_hold(x) # define __acquires(x) # define __releases(x) # define __acquire(x) (void)0 @@ -42,6 +44,10 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __rcu #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a,b) a##b +#define __PASTE(a,b) ___PASTE(a,b) + #ifdef __KERNEL__ #ifdef __GNUC__ @@ -164,6 +170,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); (typeof(ptr)) (__ptr + (off)); }) #endif +/* Not-quite-unique ID. 
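The __UNIQUE_ID/__PASTE machinery added above (and the __LINE__ fallback that follows) is easiest to see with a stand-alone example. PASTE and UNIQUE_NAME below are invented names, __COUNTER__ is a GCC/Clang extension, and this is a sketch of the token-pasting technique rather than the kernel's own macros.

#include <stdio.h>

/* Two levels of indirection so __COUNTER__ is expanded before pasting. */
#define PASTE2(a, b)	a##b
#define PASTE(a, b)	PASTE2(a, b)
#define UNIQUE_NAME(prefix)	PASTE(PASTE(prefix, _), __COUNTER__)

static int UNIQUE_NAME(tmp);	/* expands to something like tmp_0 */
static int UNIQUE_NAME(tmp);	/* ...and tmp_1, so there is no redefinition */

int main(void)
{
	printf("two distinct file-scope variables were declared\n");
	return 0;
}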
*/ +#ifndef __UNIQUE_ID +# define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) +#endif + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/trunk/include/linux/cred.h b/trunk/include/linux/cred.h index 0142aacb70b7..abb2cd50f6b2 100644 --- a/trunk/include/linux/cred.h +++ b/trunk/include/linux/cred.h @@ -344,10 +344,8 @@ static inline void put_cred(const struct cred *_cred) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) -#define task_user_ns(task) (task_cred_xxx((task), user_ns)) #else #define current_user_ns() (&init_user_ns) -#define task_user_ns(task) (&init_user_ns) #endif diff --git a/trunk/include/linux/dma-buf.h b/trunk/include/linux/dma-buf.h index eb48f3816df9..bd2e52ccc4f2 100644 --- a/trunk/include/linux/dma-buf.h +++ b/trunk/include/linux/dma-buf.h @@ -156,7 +156,6 @@ static inline void get_dma_buf(struct dma_buf *dmabuf) get_file(dmabuf->file); } -#ifdef CONFIG_DMA_SHARED_BUFFER struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev); void dma_buf_detach(struct dma_buf *dmabuf, @@ -184,103 +183,5 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); void *dma_buf_vmap(struct dma_buf *); void dma_buf_vunmap(struct dma_buf *, void *vaddr); -#else - -static inline struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, - struct device *dev) -{ - return ERR_PTR(-ENODEV); -} - -static inline void dma_buf_detach(struct dma_buf *dmabuf, - struct dma_buf_attachment *dmabuf_attach) -{ - return; -} - -static inline struct dma_buf *dma_buf_export(void *priv, - const struct dma_buf_ops *ops, - size_t size, int flags) -{ - return ERR_PTR(-ENODEV); -} - -static inline int dma_buf_fd(struct dma_buf *dmabuf, int flags) -{ - return -ENODEV; -} - -static inline struct dma_buf *dma_buf_get(int fd) -{ - return ERR_PTR(-ENODEV); -} - -static inline void dma_buf_put(struct dma_buf *dmabuf) -{ - return; -} - -static inline struct sg_table *dma_buf_map_attachment( - struct dma_buf_attachment *attach, enum dma_data_direction write) -{ - return ERR_PTR(-ENODEV); -} - -static inline void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, - struct sg_table *sg, enum dma_data_direction dir) -{ - return; -} - -static inline int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, - size_t start, size_t len, - enum dma_data_direction dir) -{ - return -ENODEV; -} - -static inline void dma_buf_end_cpu_access(struct dma_buf *dmabuf, - size_t start, size_t len, - enum dma_data_direction dir) -{ -} - -static inline void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, - unsigned long pnum) -{ - return NULL; -} - -static inline void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, - unsigned long pnum, void *vaddr) -{ -} - -static inline void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long pnum) -{ - return NULL; -} - -static inline void dma_buf_kunmap(struct dma_buf *dmabuf, - unsigned long pnum, void *vaddr) -{ -} - -static inline int dma_buf_mmap(struct dma_buf *dmabuf, - struct vm_area_struct *vma, - unsigned long pgoff) -{ - return -ENODEV; -} - -static inline void *dma_buf_vmap(struct dma_buf *dmabuf) -{ - return NULL; -} - -static inline void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) -{ -} -#endif /* CONFIG_DMA_SHARED_BUFFER */ #endif /* __DMA_BUF_H__ */ diff --git a/trunk/include/linux/drbd.h b/trunk/include/linux/drbd.h index 47e3d4850584..0c5a18ec322c 100644 --- a/trunk/include/linux/drbd.h +++ 
b/trunk/include/linux/drbd.h @@ -51,12 +51,11 @@ #endif - extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.13" -#define API_VERSION 88 +#define REL_VERSION "8.4.2" +#define API_VERSION 1 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 96 +#define PRO_VERSION_MAX 101 enum drbd_io_error_p { @@ -66,7 +65,8 @@ enum drbd_io_error_p { }; enum drbd_fencing_p { - FP_DONT_CARE, + FP_NOT_AVAIL = -1, /* Not a policy */ + FP_DONT_CARE = 0, FP_RESOURCE, FP_STONITH }; @@ -102,6 +102,20 @@ enum drbd_on_congestion { OC_DISCONNECT, }; +enum drbd_read_balancing { + RB_PREFER_LOCAL, + RB_PREFER_REMOTE, + RB_ROUND_ROBIN, + RB_LEAST_PENDING, + RB_CONGESTED_REMOTE, + RB_32K_STRIPING, + RB_64K_STRIPING, + RB_128K_STRIPING, + RB_256K_STRIPING, + RB_512K_STRIPING, + RB_1M_STRIPING, +}; + /* KEEP the order, do not delete or insert. Only append. */ enum drbd_ret_code { ERR_CODE_BASE = 100, @@ -122,7 +136,7 @@ enum drbd_ret_code { ERR_AUTH_ALG = 120, ERR_AUTH_ALG_ND = 121, ERR_NOMEM = 122, - ERR_DISCARD = 123, + ERR_DISCARD_IMPOSSIBLE = 123, ERR_DISK_CONFIGURED = 124, ERR_NET_CONFIGURED = 125, ERR_MANDATORY_TAG = 126, @@ -130,8 +144,8 @@ enum drbd_ret_code { ERR_INTR = 129, /* EINTR */ ERR_RESIZE_RESYNC = 130, ERR_NO_PRIMARY = 131, - ERR_SYNC_AFTER = 132, - ERR_SYNC_AFTER_CYCLE = 133, + ERR_RESYNC_AFTER = 132, + ERR_RESYNC_AFTER_CYCLE = 133, ERR_PAUSE_IS_SET = 134, ERR_PAUSE_IS_CLEAR = 135, ERR_PACKET_NR = 137, @@ -155,6 +169,14 @@ enum drbd_ret_code { ERR_CONG_NOT_PROTO_A = 155, ERR_PIC_AFTER_DEP = 156, ERR_PIC_PEER_DEP = 157, + ERR_RES_NOT_KNOWN = 158, + ERR_RES_IN_USE = 159, + ERR_MINOR_CONFIGURED = 160, + ERR_MINOR_EXISTS = 161, + ERR_INVALID_REQUEST = 162, + ERR_NEED_APV_100 = 163, + ERR_NEED_ALLOW_TWO_PRI = 164, + ERR_MD_UNCLEAN = 165, /* insert new ones above this line */ AFTER_LAST_ERR_CODE @@ -296,7 +318,8 @@ enum drbd_state_rv { SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ - SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ + SS_O_VOL_PEER_PRI = -20, + SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ }; /* from drbd_strings.c */ @@ -313,7 +336,9 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv); #define MDF_FULL_SYNC (1 << 3) #define MDF_WAS_UP_TO_DATE (1 << 4) #define MDF_PEER_OUT_DATED (1 << 5) -#define MDF_CRASHED_PRIMARY (1 << 6) +#define MDF_CRASHED_PRIMARY (1 << 6) +#define MDF_AL_CLEAN (1 << 7) +#define MDF_AL_DISABLED (1 << 8) enum drbd_uuid_index { UI_CURRENT, @@ -333,37 +358,23 @@ enum drbd_timeout_flag { #define UUID_JUST_CREATED ((__u64)4) +/* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 -#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) #define DRBD_MAGIC_BIG 0x835a -#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) +#define DRBD_MAGIC_100 0x8620ec20 + +#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3) +#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4) +#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5) + + +/* how I came up with this magic? 
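The comment that continues on the next line attributes DRBD_AL_MAGIC to base64-decoding "actlog=="; a throwaway user-space check (b64val() and the decode loop are written only for this illustration) prints the bytes 69 cb 65 a2, matching the 0x69cb65a2 constant.

#include <stdio.h>
#include <string.h>

/* Map one base64 character to its 6-bit value. */
static int b64val(char c)
{
	static const char tbl[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	const char *p = strchr(tbl, c);

	return p ? (int)(p - tbl) : -1;
}

int main(void)
{
	const char *s = "actlog==";
	unsigned int i, acc = 0, bits = 0;

	for (i = 0; s[i] && s[i] != '='; i++) {
		acc = ((acc << 6) | b64val(s[i])) & 0xffffff;
		bits += 6;
		if (bits >= 8) {
			bits -= 8;
			printf("%02x ", (acc >> bits) & 0xff);
		}
	}
	printf("\n");	/* prints: 69 cb 65 a2 */
	return 0;
}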
+ * base64 decode "actlog==" ;) */ +#define DRBD_AL_MAGIC 0x69cb65a2 /* these are of type "int" */ #define DRBD_MD_INDEX_INTERNAL -1 #define DRBD_MD_INDEX_FLEX_EXT -2 #define DRBD_MD_INDEX_FLEX_INT -3 -/* Start of the new netlink/connector stuff */ - -#define DRBD_NL_CREATE_DEVICE 0x01 -#define DRBD_NL_SET_DEFAULTS 0x02 - - -/* For searching a vacant cn_idx value */ -#define CN_IDX_STEP 6977 - -struct drbd_nl_cfg_req { - int packet_type; - unsigned int drbd_minor; - int flags; - unsigned short tag_list[]; -}; - -struct drbd_nl_cfg_reply { - int packet_type; - unsigned int minor; - int ret_code; /* enum ret_code or set_st_err_t */ - unsigned short tag_list[]; /* only used with get_* calls */ -}; - #endif diff --git a/trunk/include/linux/drbd_genl.h b/trunk/include/linux/drbd_genl.h new file mode 100644 index 000000000000..d0d8fac8a6e4 --- /dev/null +++ b/trunk/include/linux/drbd_genl.h @@ -0,0 +1,378 @@ +/* + * General overview: + * full generic netlink message: + * |nlmsghdr|genlmsghdr| + * + * payload: + * |optional fixed size family header| + * + * sequence of netlink attributes: + * I chose to have all "top level" attributes NLA_NESTED, + * corresponding to some real struct. + * So we have a sequence of |tla, len| + * + * nested nla sequence: + * may be empty, or contain a sequence of netlink attributes + * representing the struct fields. + * + * The tag number of any field (regardless of containing struct) + * will be available as T_ ## field_name, + * so you cannot have the same field name in two differnt structs. + * + * The tag numbers themselves are per struct, though, + * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type, + * which we won't use here). + * The tag numbers are used as index in the respective nla_policy array. + * + * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy + * genl_magic_struct.h + * generates the struct declaration, + * generates an entry in the tla enum, + * genl_magic_func.h + * generates an entry in the static tla policy + * with .type = NLA_NESTED + * generates the static _nl_policy definition, + * and static conversion functions + * + * genl_magic_func.h + * + * GENL_mc_group(group) + * genl_magic_struct.h + * does nothing + * genl_magic_func.h + * defines and registers the mcast group, + * and provides a send helper + * + * GENL_notification(op_name, op_num, mcast_group, tla list) + * These are notifications to userspace. + * + * genl_magic_struct.h + * generates an entry in the genl_ops enum, + * genl_magic_func.h + * does nothing + * + * mcast group: the name of the mcast group this notification should be + * expected on + * tla list: the list of expected top level attributes, + * for documentation and sanity checking. + * + * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations" + * These are requests from userspace. + * + * _op and _notification share the same "number space", + * op_nr will be assigned to "genlmsghdr->cmd" + * + * genl_magic_struct.h + * generates an entry in the genl_ops enum, + * genl_magic_func.h + * generates an entry in the static genl_ops array, + * and static register/unregister functions to + * genl_register_family_with_ops(). + * + * flags and handler: + * GENL_op_init( .doit = x, .dumpit = y, .flags = something) + * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM + * tla list: the list of expected top level attributes, + * for documentation and sanity checking. 
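The expansion scheme described in the overview above is a classic X-macro: one field list is expanded under different macro definitions to generate a struct, a policy table, and so on. The snippet below is a generic, self-contained illustration of that technique; DEMO_FIELDS and the other names are invented here and are not part of the genl_magic headers.

#include <stdio.h>

/* One field list, expanded twice with different per-field macros. */
#define DEMO_FIELDS(F)		\
	F(int,          minor)	\
	F(unsigned int, volume)	\
	F(long,         size)

#define DECLARE_MEMBER(type, name)	type name;
#define FIELD_NAME(type, name)		#name,

struct demo_conf {
	DEMO_FIELDS(DECLARE_MEMBER)	/* int minor; unsigned int volume; ... */
};

static const char *demo_field_names[] = {
	DEMO_FIELDS(FIELD_NAME)		/* "minor", "volume", "size" */
};

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(demo_field_names) / sizeof(demo_field_names[0]); i++)
		printf("field %zu: %s\n", i, demo_field_names[i]);
	return 0;
}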
+ */ + +/* + * STRUCTS + */ + +/* this is sent kernel -> userland on various error conditions, and contains + * informational textual info, which is supposedly human readable. + * The computer relevant return code is in the drbd_genlmsghdr. + */ +GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, + /* "arbitrary" size strings, nla_policy.len = 0 */ + __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) +) + +/* Configuration requests typically need a context to operate on. + * Possible keys are device minor (fits in the drbd_genlmsghdr), + * the replication link (aka connection) name, + * and/or the replication group (aka resource) name, + * and the volume id within the resource. */ +GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, + __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) + __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) + __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) + __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) +) + +GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, + __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128) + __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128) + __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) + + /* use the resize command to try and change the disk_size */ + __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) + /* we could change the max_bio_bvecs, + * but it won't propagate through the stack */ + __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) + + __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) + __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) + + __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) + __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) + __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) + __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) + __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) + __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) + __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) + __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) + + __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) + __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) + __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) + __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) + /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ + __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) +) + +GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, + __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32) + __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) +) + +GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, + __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, + shared_secret, SHARED_SECRET_MAX) + __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) + __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) + __str_field_def(4, 
DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) + __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) + __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) + __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) + __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) + __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) + __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) + __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) + __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) + __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) + __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) + __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) + __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) + __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) + __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) + __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) + __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) + __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) + __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) + __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) + __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) + __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) + __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) + __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) + __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) + /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ +) + +GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) +) + +GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, + __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) + __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) + __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) +) + +GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, + /* the reason of the broadcast, + * if this is an event triggered broadcast. */ + __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) + __u32_field(2, DRBD_F_REQUIRED, current_state) + __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) + __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) + + /* These are for broadcast from after state change work. + * prev_state and new_state are from the moment the state change took + * place, new_state is not neccessarily the same as current_state, + * there may have been more state changes since. Which will be + * broadcasted soon, in their respective after state change work. 
*/ + __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) + __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) + + /* if we have a local disk: */ + __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) + __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) + __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) + __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) + /* and in case resync or online verify is active */ + __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) + __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) + + /* for pre and post notifications of helper execution */ + __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) + __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) + + __u64_field(15, 0, send_cnt) + __u64_field(16, 0, recv_cnt) + __u64_field(17, 0, read_cnt) + __u64_field(18, 0, writ_cnt) + __u64_field(19, 0, al_writ_cnt) + __u64_field(20, 0, bm_writ_cnt) + __u32_field(21, 0, ap_bio_cnt) + __u32_field(22, 0, ap_pending_cnt) + __u32_field(23, 0, rs_pending_cnt) +) + +GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, + __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) + __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) +) + +GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) +) + +GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, + __u32_field(1, DRBD_F_REQUIRED, timeout_type) +) + +GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) +) + +GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) +) + +/* + * Notifications and commands (genlmsghdr->cmd) + */ +GENL_mc_group(events) + + /* kernel -> userspace announcement of changes */ +GENL_notification( + DRBD_EVENT, 1, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) +) + + /* query kernel for specific or all info */ +GENL_op( + DRBD_ADM_GET_STATUS, 2, + GENL_op_init( + .doit = drbd_adm_get_status, + .dumpit = drbd_adm_get_status_all, + /* anyone may ask for the status, + * it is broadcasted anyways */ + ), + /* To select the object .doit. + * Or a subset of objects in .dumpit. 
*/ + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) +) + + /* add DRBD minor devices as volumes to resources */ +GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + + /* add or delete resources */ +GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, + GENL_doit(drbd_adm_resource_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_CONNECT, 10, + GENL_doit(drbd_adm_connect), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_CHG_NET_OPTS, 29, + GENL_doit(drbd_adm_net_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) +) + +GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_ATTACH, 12, + GENL_doit(drbd_adm_attach), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED) +) + +GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28, + GENL_doit(drbd_adm_disk_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_RESIZE, 13, + GENL_doit(drbd_adm_resize), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_PRIMARY, 14, + GENL_doit(drbd_adm_set_role), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_SECONDARY, 15, + GENL_doit(drbd_adm_set_role), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_NEW_C_UUID, 16, + GENL_doit(drbd_adm_new_c_uuid), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_START_OV, 17, + GENL_doit(drbd_adm_start_ov), + GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_RESUME_IO, 24, 
GENL_doit(drbd_adm_resume_io), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) diff --git a/trunk/include/linux/drbd_genl_api.h b/trunk/include/linux/drbd_genl_api.h new file mode 100644 index 000000000000..9ef50d51e34e --- /dev/null +++ b/trunk/include/linux/drbd_genl_api.h @@ -0,0 +1,55 @@ +#ifndef DRBD_GENL_STRUCT_H +#define DRBD_GENL_STRUCT_H + +/** + * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests + * @minor: + * For admin requests (user -> kernel): which minor device to operate on. + * For (unicast) replies or informational (broadcast) messages + * (kernel -> user): which minor device the information is about. + * If we do not operate on minors, but on connections or resources, + * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT + * is used instead. + * @flags: possible operation modifiers (relevant only for user->kernel): + * DRBD_GENL_F_SET_DEFAULTS + * @volume: + * When creating a new minor (adding it to a resource), the resource needs + * to know which volume number within the resource this is supposed to be. + * The volume number corresponds to the same volume number on the remote side, + * whereas the minor number on the remote side may be different + * (union with flags). + * @ret_code: kernel->userland unicast cfg reply return code (union with flags); + */ +struct drbd_genlmsghdr { + __u32 minor; + union { + __u32 flags; + __s32 ret_code; + }; +}; + +/* To be used in drbd_genlmsghdr.flags */ +enum { + DRBD_GENL_F_SET_DEFAULTS = 1, +}; + +enum drbd_state_info_bcast_reason { + SIB_GET_STATUS_REPLY = 1, + SIB_STATE_CHANGE = 2, + SIB_HELPER_PRE = 3, + SIB_HELPER_POST = 4, + SIB_SYNC_PROGRESS = 5, +}; + +/* hack around predefined gcc/cpp "linux=1", + * we cannot possibly include <1/drbd_genl.h> */ +#undef linux + +#include +#define GENL_MAGIC_VERSION API_VERSION +#define GENL_MAGIC_FAMILY drbd +#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr) +#define GENL_MAGIC_INCLUDE_FILE +#include + +#endif diff --git a/trunk/include/linux/drbd_limits.h b/trunk/include/linux/drbd_limits.h index fb670bf603f7..1fa19c5f5e64 100644 --- a/trunk/include/linux/drbd_limits.h +++ b/trunk/include/linux/drbd_limits.h @@ -16,29 +16,37 @@ #define DEBUG_RANGE_CHECK 0 #define DRBD_MINOR_COUNT_MIN 1 -#define DRBD_MINOR_COUNT_MAX 256 +#define DRBD_MINOR_COUNT_MAX 255 #define DRBD_MINOR_COUNT_DEF 32 +#define DRBD_MINOR_COUNT_SCALE '1' + +#define DRBD_VOLUME_MAX 65535 #define DRBD_DIALOG_REFRESH_MIN 0 #define DRBD_DIALOG_REFRESH_MAX 600 +#define DRBD_DIALOG_REFRESH_SCALE '1' /* valid port number */ #define DRBD_PORT_MIN 1 #define DRBD_PORT_MAX 0xffff +#define DRBD_PORT_SCALE '1' /* startup { */ /* if you want more than 3.4 days, disable */ #define DRBD_WFC_TIMEOUT_MIN 0 #define DRBD_WFC_TIMEOUT_MAX 300000 #define DRBD_WFC_TIMEOUT_DEF 0 +#define DRBD_WFC_TIMEOUT_SCALE '1' #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 +#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1' #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 +#define 
DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1' /* }*/ /* net { */ @@ -47,75 +55,91 @@ #define DRBD_TIMEOUT_MIN 1 #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ +#define DRBD_TIMEOUT_SCALE '1' /* If backing disk takes longer than disk_timeout, mark the disk as failed */ #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ +#define DRBD_DISK_TIMEOUT_SCALE '1' /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 #define DRBD_CONNECT_INT_DEF 10 /* seconds */ +#define DRBD_CONNECT_INT_SCALE '1' /* keep-alive probes when idle */ #define DRBD_PING_INT_MIN 1 #define DRBD_PING_INT_MAX 120 #define DRBD_PING_INT_DEF 10 +#define DRBD_PING_INT_SCALE '1' /* timeout for the ping packets.*/ #define DRBD_PING_TIMEO_MIN 1 #define DRBD_PING_TIMEO_MAX 300 #define DRBD_PING_TIMEO_DEF 5 +#define DRBD_PING_TIMEO_SCALE '1' /* max number of write requests between write barriers */ #define DRBD_MAX_EPOCH_SIZE_MIN 1 #define DRBD_MAX_EPOCH_SIZE_MAX 20000 #define DRBD_MAX_EPOCH_SIZE_DEF 2048 +#define DRBD_MAX_EPOCH_SIZE_SCALE '1' /* I don't think that a tcp send buffer of more than 10M is useful */ #define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX (10<<20) #define DRBD_SNDBUF_SIZE_DEF 0 +#define DRBD_SNDBUF_SIZE_SCALE '1' #define DRBD_RCVBUF_SIZE_MIN 0 #define DRBD_RCVBUF_SIZE_MAX (10<<20) #define DRBD_RCVBUF_SIZE_DEF 0 +#define DRBD_RCVBUF_SIZE_SCALE '1' /* @4k PageSize -> 128kB - 512MB */ #define DRBD_MAX_BUFFERS_MIN 32 #define DRBD_MAX_BUFFERS_MAX 131072 #define DRBD_MAX_BUFFERS_DEF 2048 +#define DRBD_MAX_BUFFERS_SCALE '1' /* @4k PageSize -> 4kB - 512MB */ #define DRBD_UNPLUG_WATERMARK_MIN 1 #define DRBD_UNPLUG_WATERMARK_MAX 131072 #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) +#define DRBD_UNPLUG_WATERMARK_SCALE '1' /* 0 is disabled. * 200 should be more than enough even for very short timeouts */ #define DRBD_KO_COUNT_MIN 0 #define DRBD_KO_COUNT_MAX 200 -#define DRBD_KO_COUNT_DEF 0 +#define DRBD_KO_COUNT_DEF 7 +#define DRBD_KO_COUNT_SCALE '1' /* } */ /* syncer { */ /* FIXME allow rate to be zero? */ -#define DRBD_RATE_MIN 1 +#define DRBD_RESYNC_RATE_MIN 1 /* channel bonding 10 GbE, or other hardware */ -#define DRBD_RATE_MAX (4 << 20) -#define DRBD_RATE_DEF 250 /* kb/second */ +#define DRBD_RESYNC_RATE_MAX (4 << 20) +#define DRBD_RESYNC_RATE_DEF 250 +#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ /* less than 7 would hit performance unnecessarily. - * 3833 is the largest prime that still does fit - * into 64 sectors of activity log */ + * 919 slots context information per transaction, + * 32k activity log, 4k transaction size, + * one transaction in flight: + * 919 * 7 = 6433 */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 3833 -#define DRBD_AL_EXTENTS_DEF 127 +#define DRBD_AL_EXTENTS_MAX 6433 +#define DRBD_AL_EXTENTS_DEF 1237 +#define DRBD_AL_EXTENTS_SCALE '1' -#define DRBD_AFTER_MIN -1 -#define DRBD_AFTER_MAX 255 -#define DRBD_AFTER_DEF -1 +#define DRBD_MINOR_NUMBER_MIN -1 +#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1) +#define DRBD_MINOR_NUMBER_DEF -1 +#define DRBD_MINOR_NUMBER_SCALE '1' /* } */ @@ -124,11 +148,12 @@ * the upper limit with 64bit kernel, enough ram and flexible meta data * is 1 PiB, currently. */ /* DRBD_MAX_SECTORS */ -#define DRBD_DISK_SIZE_SECT_MIN 0 -#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40)) -#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... 
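The "1 PiB" remark a few lines above checks out: DRBD_DISK_SIZE_SECT_MAX (renamed DRBD_DISK_SIZE_MAX by this hunk) is (1 * (2LLU << 40)) sectors of 512 bytes each, i.e. 2^41 * 2^9 = 2^50 bytes. A throwaway verification:

#include <stdio.h>

int main(void)
{
	unsigned long long max_sectors = 1 * (2ULL << 40);	/* 2^41 sectors */
	unsigned long long max_bytes = max_sectors * 512;	/* 2^50 bytes */

	printf("%llu sectors = %llu bytes = %llu TiB\n",
	       max_sectors, max_bytes, max_bytes >> 40);	/* 1024 TiB = 1 PiB */
	return 0;
}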
*/ +#define DRBD_DISK_SIZE_MIN 0 +#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40)) +#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */ +#define DRBD_DISK_SIZE_SCALE 's' /* sectors */ -#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON +#define DRBD_ON_IO_ERROR_DEF EP_DETACH #define DRBD_FENCING_DEF FP_DONT_CARE #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT @@ -136,38 +161,59 @@ #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR #define DRBD_ON_CONGESTION_DEF OC_BLOCK +#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 +#define DRBD_MAX_BIO_BVECS_SCALE '1' #define DRBD_C_PLAN_AHEAD_MIN 0 #define DRBD_C_PLAN_AHEAD_MAX 300 -#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ +#define DRBD_C_PLAN_AHEAD_DEF 20 +#define DRBD_C_PLAN_AHEAD_SCALE '1' #define DRBD_C_DELAY_TARGET_MIN 1 #define DRBD_C_DELAY_TARGET_MAX 100 #define DRBD_C_DELAY_TARGET_DEF 10 +#define DRBD_C_DELAY_TARGET_SCALE '1' #define DRBD_C_FILL_TARGET_MIN 0 #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ -#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ +#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */ +#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */ -#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ +#define DRBD_C_MAX_RATE_MIN 250 #define DRBD_C_MAX_RATE_MAX (4 << 20) #define DRBD_C_MAX_RATE_DEF 102400 +#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */ -#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ +#define DRBD_C_MIN_RATE_MIN 0 #define DRBD_C_MIN_RATE_MAX (4 << 20) -#define DRBD_C_MIN_RATE_DEF 4096 +#define DRBD_C_MIN_RATE_DEF 250 +#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */ #define DRBD_CONG_FILL_MIN 0 #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ #define DRBD_CONG_FILL_DEF 0 +#define DRBD_CONG_FILL_SCALE 's' /* sectors */ #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF +#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE + +#define DRBD_PROTOCOL_DEF DRBD_PROT_C + +#define DRBD_DISK_BARRIER_DEF 0 +#define DRBD_DISK_FLUSHES_DEF 1 +#define DRBD_DISK_DRAIN_DEF 1 +#define DRBD_MD_FLUSHES_DEF 1 +#define DRBD_TCP_CORK_DEF 1 +#define DRBD_AL_UPDATES_DEF 1 + +#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 +#define DRBD_ALWAYS_ASBP_DEF 0 +#define DRBD_USE_RLE_DEF 1 -#undef RANGE #endif diff --git a/trunk/include/linux/drbd_nl.h b/trunk/include/linux/drbd_nl.h deleted file mode 100644 index a8706f08ab36..000000000000 --- a/trunk/include/linux/drbd_nl.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - PAKET( name, - TYPE ( pn, pr, member ) - ... 
- ) - - You may never reissue one of the pn arguments -*/ - -#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) -#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" -#endif - -NL_PACKET(primary, 1, - NL_BIT( 1, T_MAY_IGNORE, primary_force) -) - -NL_PACKET(secondary, 2, ) - -NL_PACKET(disk_conf, 3, - NL_INT64( 2, T_MAY_IGNORE, disk_size) - NL_STRING( 3, T_MANDATORY, backing_dev, 128) - NL_STRING( 4, T_MANDATORY, meta_dev, 128) - NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) - NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) - NL_INTEGER( 7, T_MAY_IGNORE, fencing) - NL_BIT( 37, T_MAY_IGNORE, use_bmbv) - NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) - NL_BIT( 54, T_MAY_IGNORE, no_md_flush) - /* 55 max_bio_size was available in 8.2.6rc2 */ - NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) - NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) - NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) - NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) -) - -NL_PACKET(detach, 4, - NL_BIT( 88, T_MANDATORY, detach_force) -) - -NL_PACKET(net_conf, 5, - NL_STRING( 8, T_MANDATORY, my_addr, 128) - NL_STRING( 9, T_MANDATORY, peer_addr, 128) - NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) - NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) - NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) - NL_INTEGER( 14, T_MAY_IGNORE, timeout) - NL_INTEGER( 15, T_MANDATORY, wire_protocol) - NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) - NL_INTEGER( 17, T_MAY_IGNORE, ping_int) - NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) - NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) - NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) - NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) - NL_INTEGER( 22, T_MAY_IGNORE, ko_count) - NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) - NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) - NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) - NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) - NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) - NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) - NL_INTEGER( 81, T_MAY_IGNORE, on_congestion) - NL_INTEGER( 82, T_MAY_IGNORE, cong_fill) - NL_INTEGER( 83, T_MAY_IGNORE, cong_extents) - /* 59 addr_family was available in GIT, never released */ - NL_BIT( 60, T_MANDATORY, mind_af) - NL_BIT( 27, T_MAY_IGNORE, want_lose) - NL_BIT( 28, T_MAY_IGNORE, two_primaries) - NL_BIT( 41, T_MAY_IGNORE, always_asbp) - NL_BIT( 61, T_MAY_IGNORE, no_cork) - NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) - NL_BIT( 70, T_MANDATORY, dry_run) -) - -NL_PACKET(disconnect, 6, - NL_BIT( 84, T_MAY_IGNORE, force) -) - -NL_PACKET(resize, 7, - NL_INT64( 29, T_MAY_IGNORE, resize_size) - NL_BIT( 68, T_MAY_IGNORE, resize_force) - NL_BIT( 69, T_MANDATORY, no_resync) -) - -NL_PACKET(syncer_conf, 8, - NL_INTEGER( 30, T_MAY_IGNORE, rate) - NL_INTEGER( 31, T_MAY_IGNORE, after) - NL_INTEGER( 32, T_MAY_IGNORE, al_extents) -/* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) - * NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) - * NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) - * NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) - * feature will be reimplemented differently with 8.3.9 */ - NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) - NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) - NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) - NL_BIT( 65, T_MAY_IGNORE, use_rle) - NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) - NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) - NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) - NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) - 
NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) - NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) -) - -NL_PACKET(invalidate, 9, ) -NL_PACKET(invalidate_peer, 10, ) -NL_PACKET(pause_sync, 11, ) -NL_PACKET(resume_sync, 12, ) -NL_PACKET(suspend_io, 13, ) -NL_PACKET(resume_io, 14, ) -NL_PACKET(outdate, 15, ) -NL_PACKET(get_config, 16, ) -NL_PACKET(get_state, 17, - NL_INTEGER( 33, T_MAY_IGNORE, state_i) -) - -NL_PACKET(get_uuids, 18, - NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) - NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) -) - -NL_PACKET(get_timeout_flag, 19, - NL_BIT( 36, T_MAY_IGNORE, use_degraded) -) - -NL_PACKET(call_helper, 20, - NL_STRING( 38, T_MAY_IGNORE, helper, 32) -) - -/* Tag nr 42 already allocated in drbd-8.1 development. */ - -NL_PACKET(sync_progress, 23, - NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) -) - -NL_PACKET(dump_ee, 24, - NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) - NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) - NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) - NL_INT64( 48, T_MAY_IGNORE, ee_sector) - NL_INT64( 49, T_MAY_IGNORE, ee_block_id) - NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) -) - -NL_PACKET(start_ov, 25, - NL_INT64( 66, T_MAY_IGNORE, start_sector) -) - -NL_PACKET(new_c_uuid, 26, - NL_BIT( 63, T_MANDATORY, clear_bm) -) - -#ifdef NL_RESPONSE -NL_RESPONSE(return_code_only, 27) -#endif - -#undef NL_PACKET -#undef NL_INTEGER -#undef NL_INT64 -#undef NL_BIT -#undef NL_STRING -#undef NL_RESPONSE diff --git a/trunk/include/linux/drbd_tag_magic.h b/trunk/include/linux/drbd_tag_magic.h deleted file mode 100644 index 82de1f9e48b1..000000000000 --- a/trunk/include/linux/drbd_tag_magic.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef DRBD_TAG_MAGIC_H -#define DRBD_TAG_MAGIC_H - -#define TT_END 0 -#define TT_REMOVED 0xE000 - -/* declare packet_type enums */ -enum packet_types { -#define NL_PACKET(name, number, fields) P_ ## name = number, -#define NL_RESPONSE(name, number) P_ ## name = number, -#define NL_INTEGER(pn, pr, member) -#define NL_INT64(pn, pr, member) -#define NL_BIT(pn, pr, member) -#define NL_STRING(pn, pr, member, len) -#include - P_nl_after_last_packet, -}; - -/* These struct are used to deduce the size of the tag lists: */ -#define NL_PACKET(name, number, fields) \ - struct name ## _tag_len_struct { fields }; -#define NL_INTEGER(pn, pr, member) \ - int member; int tag_and_len ## member; -#define NL_INT64(pn, pr, member) \ - __u64 member; int tag_and_len ## member; -#define NL_BIT(pn, pr, member) \ - unsigned char member:1; int tag_and_len ## member; -#define NL_STRING(pn, pr, member, len) \ - unsigned char member[len]; int member ## _len; \ - int tag_and_len ## member; -#include - -/* declare tag-list-sizes */ -static const int tag_list_sizes[] = { -#define NL_PACKET(name, number, fields) 2 fields , -#define NL_INTEGER(pn, pr, member) + 4 + 4 -#define NL_INT64(pn, pr, member) + 4 + 8 -#define NL_BIT(pn, pr, member) + 4 + 1 -#define NL_STRING(pn, pr, member, len) + 4 + (len) -#include -}; - -/* The two highest bits are used for the tag type */ -#define TT_MASK 0xC000 -#define TT_INTEGER 0x0000 -#define TT_INT64 0x4000 -#define TT_BIT 0x8000 -#define TT_STRING 0xC000 -/* The next bit indicates if processing of the tag is mandatory */ -#define T_MANDATORY 0x2000 -#define T_MAY_IGNORE 0x0000 -#define TN_MASK 0x1fff -/* The remaining 13 bits are used to enumerate the tags */ - -#define tag_type(T) ((T) & TT_MASK) -#define tag_number(T) ((T) & TN_MASK) - -/* declare tag enums */ -#define NL_PACKET(name, number, fields) 
fields -enum drbd_tags { -#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , -#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , -#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , -#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , -#include -}; - -struct tag { - const char *name; - int type_n_flags; - int max_len; -}; - -/* declare tag names */ -#define NL_PACKET(name, number, fields) fields -static const struct tag tag_descriptions[] = { -#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, -#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, -#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, -#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, -#include -}; - -#endif diff --git a/trunk/include/linux/exportfs.h b/trunk/include/linux/exportfs.h index 12291a7ee275..c7e6b6392ab8 100644 --- a/trunk/include/linux/exportfs.h +++ b/trunk/include/linux/exportfs.h @@ -177,6 +177,8 @@ struct export_operations { int (*commit_metadata)(struct inode *inode); }; +extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, + int *max_len, struct inode *parent); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int connectable); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 408fb1e77a0a..a823d4be38e7 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -44,6 +44,7 @@ struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; +struct seq_file; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -1543,6 +1544,7 @@ struct file_operations { int (*setlease)(struct file *, long, struct file_lock **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); + int (*show_fdinfo)(struct seq_file *m, struct file *f); }; struct inode_operations { @@ -1578,8 +1580,6 @@ struct inode_operations { umode_t create_mode, int *opened); } ____cacheline_aligned; -struct seq_file; - ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, @@ -1810,6 +1810,8 @@ struct file_system_type { #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ +#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ struct dentry *(*mount) (struct file_system_type *, int, @@ -2286,9 +2288,9 @@ extern ino_t find_inode_number(struct dentry *, struct qstr *); #include /* needed for stackable file system support */ -extern loff_t default_llseek(struct file *file, loff_t offset, int origin); +extern loff_t default_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); +extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); @@ -2396,11 +2398,11 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); -extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); -extern loff_t no_llseek(struct file *file, loff_t offset, int origin); -extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); +extern loff_t no_llseek(struct file *file, loff_t offset, int whence); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, - int origin, loff_t maxsize, loff_t eof); + int whence, loff_t maxsize, loff_t eof); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); diff --git a/trunk/include/linux/ftrace.h b/trunk/include/linux/ftrace.h index a52f2f4fe030..92691d85c320 100644 --- a/trunk/include/linux/ftrace.h +++ b/trunk/include/linux/ftrace.h @@ -394,7 +394,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init @@ -559,7 +559,7 @@ static inline ssize_t ftrace_filter_write(struct file *file, const char __user * size_t cnt, loff_t *ppos) { return -ENODEV; } static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } -static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { return -ENODEV; } diff --git a/trunk/include/linux/genhd.h b/trunk/include/linux/genhd.h index 4f440b3e89fe..79b8bba19363 100644 --- a/trunk/include/linux/genhd.h +++ b/trunk/include/linux/genhd.h @@ -88,10 +88,14 @@ struct disk_stats { }; #define PARTITION_META_INFO_VOLNAMELTH 64 -#define PARTITION_META_INFO_UUIDLTH 16 +/* + * Enough for the string representation of any kind of UUID plus NULL. + * EFI UUID is 36 characters. MSDOS UUID is 11 characters. 
+ */ +#define PARTITION_META_INFO_UUIDLTH 37 struct partition_meta_info { - u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ + char uuid[PARTITION_META_INFO_UUIDLTH]; u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; diff --git a/trunk/include/linux/genl_magic_func.h b/trunk/include/linux/genl_magic_func.h new file mode 100644 index 000000000000..023bc346b877 --- /dev/null +++ b/trunk/include/linux/genl_magic_func.h @@ -0,0 +1,422 @@ +#ifndef GENL_MAGIC_FUNC_H +#define GENL_MAGIC_FUNC_H + +#include + +/* + * Magic: declare tla policy {{{1 + * Magic: declare nested policies + * {{{2 + */ +#undef GENL_mc_group +#define GENL_mc_group(group) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + [tag_name] = { .type = NLA_NESTED }, + +static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static struct nla_policy s_name ## _nl_policy[] __read_mostly = \ +{ s_fields }; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \ + __put, __is_signed) \ + [attr_nr] = { .type = nla_type }, + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \ + __get, __put, __is_signed) \ + [attr_nr] = { .type = nla_type, \ + .len = maxlen - (nla_type == NLA_NUL_STRING) }, + +#include GENL_MAGIC_INCLUDE_FILE + +#ifndef __KERNEL__ +#ifndef pr_info +#define pr_info(args...) fprintf(stderr, args); +#endif +#endif + +#ifdef GENL_MAGIC_DEBUG +static void dprint_field(const char *dir, int nla_type, + const char *name, void *valp) +{ + __u64 val = valp ? *(__u32 *)valp : 1; + switch (nla_type) { + case NLA_U8: val = (__u8)val; + case NLA_U16: val = (__u16)val; + case NLA_U32: val = (__u32)val; + pr_info("%s attr %s: %d 0x%08x\n", dir, + name, (int)val, (unsigned)val); + break; + case NLA_U64: + val = *(__u64*)valp; + pr_info("%s attr %s: %lld 0x%08llx\n", dir, + name, (long long)val, (unsigned long long)val); + break; + case NLA_FLAG: + if (val) + pr_info("%s attr %s: set\n", dir, name); + break; + } +} + +static void dprint_array(const char *dir, int nla_type, + const char *name, const char *val, unsigned len) +{ + switch (nla_type) { + case NLA_NUL_STRING: + if (len && val[len-1] == '\0') + len--; + pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val); + break; + default: + /* we can always show 4 byte, + * thats what nlattr are aligned to. */ + pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n", + dir, name, len, val[0], val[1], val[2], val[3]); + } +} + +#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b); + +/* Name is a member field name of the struct s. + * If s is NULL (only parsing, no copy requested in *_from_attrs()), + * nla is supposed to point to the attribute containing the information + * corresponding to that struct member. */ +#define DPRINT_FIELD(dir, nla_type, name, s, nla) \ + do { \ + if (s) \ + dprint_field(dir, nla_type, #name, &s->name); \ + else if (nla) \ + dprint_field(dir, nla_type, #name, \ + (nla_type == NLA_FLAG) ? 
NULL \ + : nla_data(nla)); \ + } while (0) + +#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \ + do { \ + if (s) \ + dprint_array(dir, nla_type, #name, \ + s->name, s->name ## _len); \ + else if (nla) \ + dprint_array(dir, nla_type, #name, \ + nla_data(nla), nla_len(nla)); \ + } while (0) +#else +#define DPRINT_TLA(a, op, b) do {} while (0) +#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0) +#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0) +#endif + +/* + * Magic: provide conversion functions {{{1 + * populate struct from attribute table: + * {{{2 + */ + +/* processing of generic netlink messages is serialized. + * use one static buffer for parsing of nested attributes */ +static struct nlattr *nested_attr_tb[128]; + +#ifndef BUILD_BUG_ON +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. in a structure initializer (or where-ever else comma expressions + aren't permitted). */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +#endif + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +/* *_from_attrs functions are static, but potentially unused */ \ +static int __ ## s_name ## _from_attrs(struct s_name *s, \ + struct genl_info *info, bool exclude_invariants) \ +{ \ + const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \ + struct nlattr *tla = info->attrs[tag_number]; \ + struct nlattr **ntb = nested_attr_tb; \ + struct nlattr *nla; \ + int err; \ + BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \ + if (!tla) \ + return -ENOMSG; \ + DPRINT_TLA(#s_name, "<=-", #tag_name); \ + err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ + if (err) \ + return err; \ + \ + s_fields \ + return 0; \ +} __attribute__((unused)) \ +static int s_name ## _from_attrs(struct s_name *s, \ + struct genl_info *info) \ +{ \ + return __ ## s_name ## _from_attrs(s, info, false); \ +} __attribute__((unused)) \ +static int s_name ## _from_attrs_for_change(struct s_name *s, \ + struct genl_info *info) \ +{ \ + return __ ## s_name ## _from_attrs(s, info, true); \ +} __attribute__((unused)) \ + +#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \ + nla = ntb[attr_nr]; \ + if (nla) { \ + if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ + pr_info("<< must not change invariant attr: %s\n", #name); \ + return -EEXIST; \ + } \ + assignment; \ + } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ + /* attribute missing from payload, */ \ + /* which was expected */ \ + } else if ((attr_flag) & DRBD_F_REQUIRED) { \ + pr_info("<< missing attr: %s\n", #name); \ + return -ENOMSG; \ + } + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + __assign(attr_nr, attr_flag, name, nla_type, type, \ + if (s) \ + s->name = __get(nla); \ + DPRINT_FIELD("<<", nla_type, name, s, nla)) + +/* validate_nla() already checked nla_len <= maxlen appropriately. 
*/ +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + __assign(attr_nr, attr_flag, name, nla_type, type, \ + if (s) \ + s->name ## _len = \ + __get(s->name, nla, maxlen); \ + DPRINT_ARRAY("<<", nla_type, name, s, nla)) + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +/* + * Magic: define op number to op name mapping {{{1 + * {{{2 + */ +const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) +{ + switch (cmd) { +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ + case op_num: return #op_name; +#include GENL_MAGIC_INCLUDE_FILE + default: + return "unknown"; + } +} + +#ifdef __KERNEL__ +#include +/* + * Magic: define genl_ops {{{1 + * {{{2 + */ + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ +{ \ + handler \ + .cmd = op_name, \ + .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ +}, + +#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) +static struct genl_ops ZZZ_genl_ops[] __read_mostly = { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +/* + * Define the genl_family, multicast groups, {{{1 + * and provide register/unregister functions. + * {{{2 + */ +#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) +static struct genl_family ZZZ_genl_family __read_mostly = { + .id = GENL_ID_GENERATE, + .name = __stringify(GENL_MAGIC_FAMILY), + .version = GENL_MAGIC_VERSION, +#ifdef GENL_MAGIC_FAMILY_HDRSZ + .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), +#endif + .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, +}; + +/* + * Magic: define multicast groups + * Magic: define multicast group registration helper + */ +#undef GENL_mc_group +#define GENL_mc_group(group) \ +static struct genl_multicast_group \ +CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \ + .name = #group, \ +}; \ +static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ + struct sk_buff *skb, gfp_t flags) \ +{ \ + unsigned int group_id = \ + CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \ + if (!group_id) \ + return -EINVAL; \ + return genlmsg_multicast(skb, 0, group_id, flags); \ +} + +#include GENL_MAGIC_INCLUDE_FILE + +int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) +{ + int err = genl_register_family_with_ops(&ZZZ_genl_family, + ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops)); + if (err) + return err; +#undef GENL_mc_group +#define GENL_mc_group(group) \ + err = genl_register_mc_group(&ZZZ_genl_family, \ + &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \ + if (err) \ + goto fail; \ + else \ + pr_info("%s: mcg %s: %u\n", #group, \ + __stringify(GENL_MAGIC_FAMILY), \ + CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id); + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_mc_group +#define GENL_mc_group(group) + return 0; +fail: + genl_unregister_family(&ZZZ_genl_family); + return err; +} + +void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) +{ + genl_unregister_family(&ZZZ_genl_family); +} + +/* + * Magic: provide conversion functions {{{1 + * populate skb from struct. 
+ * {{{2 + */ + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \ + const bool exclude_sensitive) \ +{ \ + struct nlattr *tla = nla_nest_start(skb, tag_number); \ + if (!tla) \ + goto nla_put_failure; \ + DPRINT_TLA(#s_name, "-=>", #tag_name); \ + s_fields \ + nla_nest_end(skb, tla); \ + return 0; \ + \ +nla_put_failure: \ + if (tla) \ + nla_nest_cancel(skb, tla); \ + return -EMSGSIZE; \ +} \ +static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \ + struct s_name *s) \ +{ \ + return s_name ## _to_skb(skb, s, 0); \ +} \ +static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \ + struct s_name *s) \ +{ \ + return s_name ## _to_skb(skb, s, 1); \ +} + + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ + DPRINT_FIELD(">>", nla_type, name, s, NULL); \ + if (__put(skb, attr_nr, s->name)) \ + goto nla_put_failure; \ + } + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ + DPRINT_ARRAY(">>",nla_type, name, s, NULL); \ + if (__put(skb, attr_nr, min_t(int, maxlen, \ + s->name ## _len + (nla_type == NLA_NUL_STRING)),\ + s->name)) \ + goto nla_put_failure; \ + } + +#include GENL_MAGIC_INCLUDE_FILE + + +/* Functions for initializing structs to default values. */ + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) +#undef __u32_field_def +#define __u32_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __s32_field_def +#define __s32_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __flg_field_def +#define __flg_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __str_field_def +#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ + memset(x->name, 0, sizeof(x->name)); \ + x->name ## _len = 0; +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \ +static void set_ ## s_name ## _defaults(struct s_name *x) { \ +s_fields \ +} + +#include GENL_MAGIC_INCLUDE_FILE + +#endif /* __KERNEL__ */ + +/* }}}1 */ +#endif /* GENL_MAGIC_FUNC_H */ +/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff --git a/trunk/include/linux/genl_magic_struct.h b/trunk/include/linux/genl_magic_struct.h new file mode 100644 index 000000000000..eecd19b37001 --- /dev/null +++ b/trunk/include/linux/genl_magic_struct.h @@ -0,0 +1,277 @@ +#ifndef GENL_MAGIC_STRUCT_H +#define GENL_MAGIC_STRUCT_H + +#ifndef GENL_MAGIC_FAMILY +# error "you need to define GENL_MAGIC_FAMILY before inclusion" +#endif + +#ifndef GENL_MAGIC_VERSION +# error "you need to define GENL_MAGIC_VERSION before inclusion" +#endif + +#ifndef GENL_MAGIC_INCLUDE_FILE +# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion" +#endif + +#include +#include + +#define CONCAT__(a,b) a ## b +#define CONCAT_(a,b) CONCAT__(a,b) + +extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void); +extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void); + 
+/* + * Extension of genl attribute validation policies {{{2 + */ + +/* + * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not + * know about. This flag can be set in nlattr->nla_type to indicate that this + * attribute must not be ignored. + * + * We check and remove this flag in drbd_nla_check_mandatory() before + * validating the attribute types and lengths via nla_parse_nested(). + */ +#define DRBD_GENLA_F_MANDATORY (1 << 14) + +/* + * Flags specific to drbd and not visible at the netlink layer, used in + * _from_attrs and _to_skb: + * + * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is + * invalid. + * + * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be + * included in unpriviledged get requests or broadcasts. + * + * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but + * cannot subsequently be changed. + */ +#define DRBD_F_REQUIRED (1 << 0) +#define DRBD_F_SENSITIVE (1 << 1) +#define DRBD_F_INVARIANT (1 << 2) + +#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) + +/* }}}1 + * MAGIC + * multi-include macro expansion magic starts here + */ + +/* MAGIC helpers {{{2 */ + +/* possible field types */ +#define __flg_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U8, char, \ + nla_get_u8, nla_put_u8, false) +#define __u8_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \ + nla_get_u8, nla_put_u8, false) +#define __u16_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U16, __u16, \ + nla_get_u16, nla_put_u16, false) +#define __u32_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U32, __u32, \ + nla_get_u32, nla_put_u32, false) +#define __s32_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U32, __s32, \ + nla_get_u32, nla_put_u32, true) +#define __u64_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U64, __u64, \ + nla_get_u64, nla_put_u64, false) +#define __str_field(attr_nr, attr_flag, name, maxlen) \ + __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ + nla_strlcpy, nla_put, false) +#define __bin_field(attr_nr, attr_flag, name, maxlen) \ + __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ + nla_memcpy, nla_put, false) + +/* fields with default values */ +#define __flg_field_def(attr_nr, attr_flag, name, default) \ + __flg_field(attr_nr, attr_flag, name) +#define __u32_field_def(attr_nr, attr_flag, name, default) \ + __u32_field(attr_nr, attr_flag, name) +#define __s32_field_def(attr_nr, attr_flag, name, default) \ + __s32_field(attr_nr, attr_flag, name) +#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ + __str_field(attr_nr, attr_flag, name, maxlen) + +#define GENL_op_init(args...) 
args +#define GENL_doit(handler) \ + .doit = handler, \ + .flags = GENL_ADMIN_PERM, +#define GENL_dumpit(handler) \ + .dumpit = handler, \ + .flags = GENL_ADMIN_PERM, + +/* }}}1 + * Magic: define the enum symbols for genl_ops + * Magic: define the enum symbols for top level attributes + * Magic: define the enum symbols for nested attributes + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +#undef GENL_mc_group +#define GENL_mc_group(group) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ + op_name = op_num, + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ + op_name = op_num, + +enum { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + tag_name = tag_number, + +enum { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +enum { \ + s_fields \ +}; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, \ + __get, __put, __is_signed) \ + T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, \ + maxlen, __get, __put, __is_signed) \ + T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 + * Magic: compile time assert unique numbers for operations + * Magic: -"- unique numbers for top level attributes + * Magic: -"- unique numbers for nested attributes + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) \ + case op_name: + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ + case op_name: + +static inline void ct_assert_unique_operations(void) +{ + switch (0) { +#include GENL_MAGIC_INCLUDE_FILE + ; + } +} + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + case tag_number: + +static inline void ct_assert_unique_top_level_attributes(void) +{ + switch (0) { +#include GENL_MAGIC_INCLUDE_FILE + ; + } +} + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ +{ \ + switch (0) { \ + s_fields \ + ; \ + } \ +} + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + case attr_nr: + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + case attr_nr: + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 + * Magic: declare structs + * struct { + * fields + * }; + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +struct s_name { s_fields }; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + type name; + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + type 
name[maxlen]; \ + __u32 name ## _len; + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +enum { \ + s_fields \ +}; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + is_signed) \ + F_ ## name ## _IS_SIGNED = is_signed, + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, is_signed) \ + F_ ## name ## _IS_SIGNED = is_signed, + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 */ +#endif /* GENL_MAGIC_STRUCT_H */ +/* vim: set foldmethod=marker nofoldenable : */ diff --git a/trunk/include/linux/gfp.h b/trunk/include/linux/gfp.h index f74856e17e48..0f615eb23d05 100644 --- a/trunk/include/linux/gfp.h +++ b/trunk/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -89,6 +90,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -365,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); +extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); + #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/trunk/include/linux/hugetlb_cgroup.h b/trunk/include/linux/hugetlb_cgroup.h index d73878c694b3..ce8217f7b5c2 100644 --- a/trunk/include/linux/hugetlb_cgroup.h +++ b/trunk/include/linux/hugetlb_cgroup.h @@ -62,7 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); -extern int hugetlb_cgroup_file_init(int idx) __init; +extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage); @@ -111,9 +111,8 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static inline int __init hugetlb_cgroup_file_init(int idx) +static inline void hugetlb_cgroup_file_init(void) { - return 0; } static inline void hugetlb_cgroup_migrate(struct page *oldhpage, diff --git a/trunk/include/linux/i2c-omap.h b/trunk/include/linux/i2c-omap.h index 92a0dc75bc74..babe0cf6d56b 100644 --- a/trunk/include/linux/i2c-omap.h +++ b/trunk/include/linux/i2c-omap.h @@ -20,8 +20,6 @@ #define OMAP_I2C_FLAG_NO_FIFO BIT(0) #define OMAP_I2C_FLAG_SIMPLE_CLOCK BIT(1) #define OMAP_I2C_FLAG_16BIT_DATA_REG BIT(2) -#define OMAP_I2C_FLAG_RESET_REGS_POSTIDLE BIT(3) -#define OMAP_I2C_FLAG_APPLY_ERRATA_I207 BIT(4) #define OMAP_I2C_FLAG_ALWAYS_ARMXOR_CLK BIT(5) #define OMAP_I2C_FLAG_FORCE_19200_INT_CLK BIT(6) /* how the CPU address bus must be translated for I2C unit access */ diff --git a/trunk/include/linux/i2c/i2c-sh_mobile.h b/trunk/include/linux/i2c/i2c-sh_mobile.h index 
beda7081aead..06e3089795fb 100644 --- a/trunk/include/linux/i2c/i2c-sh_mobile.h +++ b/trunk/include/linux/i2c/i2c-sh_mobile.h @@ -5,6 +5,7 @@ struct i2c_sh_mobile_platform_data { unsigned long bus_speed; + unsigned int clks_per_count; }; #endif /* __I2C_SH_MOBILE_H__ */ diff --git a/trunk/include/linux/idr.h b/trunk/include/linux/idr.h index 87259a44c251..de7e190f1af4 100644 --- a/trunk/include/linux/idr.h +++ b/trunk/include/linux/idr.h @@ -152,4 +152,15 @@ void ida_simple_remove(struct ida *ida, unsigned int id); void __init idr_init_cache(void); +/** + * idr_for_each_entry - iterate over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + */ +#define idr_for_each_entry(idp, entry, id) \ + for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ + entry != NULL; \ + ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) + #endif /* __IDR_H__ */ diff --git a/trunk/include/linux/ima.h b/trunk/include/linux/ima.h index 2c7223d7e73b..86c361e947b9 100644 --- a/trunk/include/linux/ima.h +++ b/trunk/include/linux/ima.h @@ -18,6 +18,7 @@ extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern int ima_module_check(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -40,6 +41,11 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +static inline int ima_module_check(struct file *file) +{ + return 0; +} + #endif /* CONFIG_IMA_H */ #ifdef CONFIG_IMA_APPRAISE diff --git a/trunk/include/linux/init.h b/trunk/include/linux/init.h index f63692d6902e..a799273714ac 100644 --- a/trunk/include/linux/init.h +++ b/trunk/include/linux/init.h @@ -182,16 +182,16 @@ extern bool initcall_debug; * can point at the same handler without causing duplicate-symbol build errors. */ -#define __define_initcall(level,fn,id) \ +#define __define_initcall(fn, id) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" level ".init"))) = fn + __attribute__((__section__(".initcall" #id ".init"))) = fn /* * Early initcalls run before initializing SMP. * * Only for built-in code, not modules. */ -#define early_initcall(fn) __define_initcall("early",fn,early) +#define early_initcall(fn) __define_initcall(fn, early) /* * A "pure" initcall has no dependencies on anything else, and purely @@ -200,23 +200,23 @@ extern bool initcall_debug; * This only exists for built-in code, not for modules. * Keep main.c:initcall_level_names[] in sync. 
*/ -#define pure_initcall(fn) __define_initcall("0",fn,0) - -#define core_initcall(fn) __define_initcall("1",fn,1) -#define core_initcall_sync(fn) __define_initcall("1s",fn,1s) -#define postcore_initcall(fn) __define_initcall("2",fn,2) -#define postcore_initcall_sync(fn) __define_initcall("2s",fn,2s) -#define arch_initcall(fn) __define_initcall("3",fn,3) -#define arch_initcall_sync(fn) __define_initcall("3s",fn,3s) -#define subsys_initcall(fn) __define_initcall("4",fn,4) -#define subsys_initcall_sync(fn) __define_initcall("4s",fn,4s) -#define fs_initcall(fn) __define_initcall("5",fn,5) -#define fs_initcall_sync(fn) __define_initcall("5s",fn,5s) -#define rootfs_initcall(fn) __define_initcall("rootfs",fn,rootfs) -#define device_initcall(fn) __define_initcall("6",fn,6) -#define device_initcall_sync(fn) __define_initcall("6s",fn,6s) -#define late_initcall(fn) __define_initcall("7",fn,7) -#define late_initcall_sync(fn) __define_initcall("7s",fn,7s) +#define pure_initcall(fn) __define_initcall(fn, 0) + +#define core_initcall(fn) __define_initcall(fn, 1) +#define core_initcall_sync(fn) __define_initcall(fn, 1s) +#define postcore_initcall(fn) __define_initcall(fn, 2) +#define postcore_initcall_sync(fn) __define_initcall(fn, 2s) +#define arch_initcall(fn) __define_initcall(fn, 3) +#define arch_initcall_sync(fn) __define_initcall(fn, 3s) +#define subsys_initcall(fn) __define_initcall(fn, 4) +#define subsys_initcall_sync(fn) __define_initcall(fn, 4s) +#define fs_initcall(fn) __define_initcall(fn, 5) +#define fs_initcall_sync(fn) __define_initcall(fn, 5s) +#define rootfs_initcall(fn) __define_initcall(fn, rootfs) +#define device_initcall(fn) __define_initcall(fn, 6) +#define device_initcall_sync(fn) __define_initcall(fn, 6s) +#define late_initcall(fn) __define_initcall(fn, 7) +#define late_initcall_sync(fn) __define_initcall(fn, 7s) #define __initcall(fn) device_initcall(fn) diff --git a/trunk/include/linux/input.h b/trunk/include/linux/input.h index cab994ba6d91..82ce323b9986 100644 --- a/trunk/include/linux/input.h +++ b/trunk/include/linux/input.h @@ -112,6 +112,11 @@ struct input_value { * @h_list: list of input handles associated with the device. When * accessing the list dev->mutex must be held * @node: used to place the device onto input_dev_list + * @num_vals: number of values queued in the current frame + * @max_vals: maximum number of values queued in a frame + * @vals: array of values queued in the current frame + * @devres_managed: indicates that devices is managed with devres framework + * and needs not be explicitly unregistered or freed. 
*/ struct input_dev { const char *name; @@ -180,6 +185,8 @@ struct input_dev { unsigned int num_vals; unsigned int max_vals; struct input_value *vals; + + bool devres_managed; }; #define to_input_dev(d) container_of(d, struct input_dev, dev) @@ -323,7 +330,8 @@ struct input_handle { struct list_head h_node; }; -struct input_dev *input_allocate_device(void); +struct input_dev __must_check *input_allocate_device(void); +struct input_dev __must_check *devm_input_allocate_device(struct device *); void input_free_device(struct input_dev *dev); static inline struct input_dev *input_get_device(struct input_dev *dev) diff --git a/trunk/include/linux/input/bu21013.h b/trunk/include/linux/input/bu21013.h index 05e03284b92a..6230d76bde5d 100644 --- a/trunk/include/linux/input/bu21013.h +++ b/trunk/include/linux/input/bu21013.h @@ -9,13 +9,10 @@ /** * struct bu21013_platform_device - Handle the platform data - * @cs_en: pointer to the cs enable function - * @cs_dis: pointer to the cs disable function - * @irq_read_val: pointer to read the pen irq value function * @touch_x_max: touch x max * @touch_y_max: touch y max * @cs_pin: chip select pin - * @irq: irq pin + * @touch_pin: touch gpio pin * @ext_clk: external clock flag * @x_flip: x flip flag * @y_flip: y flip flag @@ -24,13 +21,10 @@ * This is used to handle the platform data */ struct bu21013_platform_device { - int (*cs_en)(int reset_pin); - int (*cs_dis)(int reset_pin); - int (*irq_read_val)(void); int touch_x_max; int touch_y_max; unsigned int cs_pin; - unsigned int irq; + unsigned int touch_pin; bool ext_clk; bool x_flip; bool y_flip; diff --git a/trunk/include/linux/ipc_namespace.h b/trunk/include/linux/ipc_namespace.h index 5499c92a9153..fe771978e877 100644 --- a/trunk/include/linux/ipc_namespace.h +++ b/trunk/include/linux/ipc_namespace.h @@ -67,6 +67,8 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; + + unsigned int proc_inum; }; extern struct ipc_namespace init_ipc_ns; @@ -133,7 +135,8 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk); + struct user_namespace *user_ns, struct ipc_namespace *ns); + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) @@ -144,12 +147,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); - return tsk->nsproxy->ipc_ns; + return ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) diff --git a/trunk/include/linux/kernel.h b/trunk/include/linux/kernel.h index d97ed5897447..d140e8fb075f 100644 --- a/trunk/include/linux/kernel.h +++ b/trunk/include/linux/kernel.h @@ -220,6 +220,23 @@ int __must_check _kstrtol(const char *s, unsigned int base, long *res); int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res); int __must_check kstrtoll(const char *s, unsigned int base, long long *res); + +/** + * kstrtoul - convert a string to an unsigned long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. 
+ * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. +*/ static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res) { /* @@ -233,6 +250,22 @@ static inline int __must_check kstrtoul(const char *s, unsigned int base, unsign return _kstrtoul(s, base, res); } +/** + * kstrtol - convert a string to a long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ static inline int __must_check kstrtol(const char *s, unsigned int base, long *res) { /* diff --git a/trunk/include/linux/loop.h b/trunk/include/linux/loop.h index 6492181bcb1d..460b60fa7adf 100644 --- a/trunk/include/linux/loop.h +++ b/trunk/include/linux/loop.h @@ -53,10 +53,13 @@ struct loop_device { spinlock_t lo_lock; struct bio_list lo_bio_list; + unsigned int lo_bio_count; int lo_state; struct mutex lo_ctl_mutex; struct task_struct *lo_thread; wait_queue_head_t lo_event; + /* wait queue for incoming requests */ + wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; struct gendisk *lo_disk; diff --git a/trunk/include/linux/lru_cache.h b/trunk/include/linux/lru_cache.h index cafc7f99e124..4019013c6593 100644 --- a/trunk/include/linux/lru_cache.h +++ b/trunk/include/linux/lru_cache.h @@ -166,9 +166,11 @@ struct lc_element { /* if we want to track a larger set of objects, * it needs to become arch independend u64 */ unsigned lc_number; - /* special label when on free list */ #define LC_FREE (~0U) + + /* for pending changes */ + unsigned lc_new_number; }; struct lru_cache { @@ -176,6 +178,7 @@ struct lru_cache { struct list_head lru; struct list_head free; struct list_head in_use; + struct list_head to_be_changed; /* the pre-created kmem cache to allocate the objects from */ struct kmem_cache *lc_cache; @@ -186,7 +189,7 @@ struct lru_cache { size_t element_off; /* number of elements (indices) */ - unsigned int nr_elements; + unsigned int nr_elements; /* Arbitrary limit on maximum tracked objects. Practical limit is much * lower due to allocation failures, probably. For typical use cases, * nr_elements should be a few thousand at most. @@ -194,18 +197,19 @@ struct lru_cache { * 8 high bits of .lc_index to be overloaded with flags in the future. 
*/ #define LC_MAX_ACTIVE (1<<24) + /* allow to accumulate a few (index:label) changes, + * but no more than max_pending_changes */ + unsigned int max_pending_changes; + /* number of elements currently on to_be_changed list */ + unsigned int pending_changes; + /* statistics */ - unsigned used; /* number of lelements currently on in_use list */ - unsigned long hits, misses, starving, dirty, changed; + unsigned used; /* number of elements currently on in_use list */ + unsigned long hits, misses, starving, locked, changed; /* see below: flag-bits for lru_cache */ unsigned long flags; - /* when changing the label of an index element */ - unsigned int new_number; - - /* for paranoia when changing the label of an index element */ - struct lc_element *changing_element; void *lc_private; const char *name; @@ -221,10 +225,15 @@ enum { /* debugging aid, to catch concurrent access early. * user needs to guarantee exclusive access by proper locking! */ __LC_PARANOIA, - /* if we need to change the set, but currently there is a changing - * transaction pending, we are "dirty", and must deferr further - * changing requests */ + + /* annotate that the set is "dirty", possibly accumulating further + * changes, until a transaction is finally triggered */ __LC_DIRTY, + + /* Locked, no further changes allowed. + * Also used to serialize changing transactions. */ + __LC_LOCKED, + /* if we need to change the set, but currently there is no free nor * unused element available, we are "starving", and must not give out * further references, to guarantee that eventually some refcnt will @@ -236,9 +245,11 @@ enum { }; #define LC_PARANOIA (1<<__LC_PARANOIA) #define LC_DIRTY (1<<__LC_DIRTY) +#define LC_LOCKED (1<<__LC_LOCKED) #define LC_STARVING (1<<__LC_STARVING) extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); @@ -249,7 +260,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); -extern void lc_changed(struct lru_cache *lc, struct lc_element *e); +extern void lc_committed(struct lru_cache *lc); struct seq_file; extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); @@ -258,17 +269,29 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char void (*detail) (struct seq_file *, struct lc_element *)); /** - * lc_try_lock - can be used to stop lc_get() from changing the tracked set + * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set * @lc: the lru cache to operate on * - * Note that the reference counts and order on the active and lru lists may - * still change. Returns true if we acquired the lock. + * Allows (expects) the set to be "dirty". Note that the reference counts and + * order on the active and lru lists may still change. Used to serialize + * changing transactions. Returns true if we aquired the lock. 
*/ -static inline int lc_try_lock(struct lru_cache *lc) +static inline int lc_try_lock_for_transaction(struct lru_cache *lc) { - return !test_and_set_bit(__LC_DIRTY, &lc->flags); + return !test_and_set_bit(__LC_LOCKED, &lc->flags); } +/** + * lc_try_lock - variant to stop lc_get() from changing the tracked set + * @lc: the lru cache to operate on + * + * Note that the reference counts and order on the active and lru lists may + * still change. Only works on a "clean" set. Returns true if we acquired the + * lock, which means there are no pending changes, and any further attempt to + * change the set will not succeed until the next lc_unlock(). + */ +extern int lc_try_lock(struct lru_cache *lc); + /** * lc_unlock - unlock @lc, allow lc_get() to change the set again * @lc: the lru cache to operate on @@ -276,14 +299,10 @@ static inline int lc_try_lock(struct lru_cache *lc) static inline void lc_unlock(struct lru_cache *lc) { clear_bit(__LC_DIRTY, &lc->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(__LC_LOCKED, &lc->flags); } -static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) -{ - struct lc_element *e = lc_find(lc, enr); - return e && e->refcnt; -} +extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); #define lc_entry(ptr, type, member) \ container_of(ptr, type, member) diff --git a/trunk/include/linux/memcontrol.h b/trunk/include/linux/memcontrol.h index e98a74c0c9c0..0108a56f814e 100644 --- a/trunk/include/linux/memcontrol.h +++ b/trunk/include/linux/memcontrol.h @@ -21,11 +21,14 @@ #define _LINUX_MEMCONTROL_H #include #include +#include +#include struct mem_cgroup; struct page_cgroup; struct page; struct mm_struct; +struct kmem_cache; /* Stats that can be updated by kernel. */ enum mem_cgroup_page_stat_item { @@ -414,5 +417,211 @@ static inline void sock_release_memcg(struct sock *sk) { } #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +extern struct static_key memcg_kmem_enabled_key; + +extern int memcg_limited_groups_array_size; + +/* + * Helper macro to loop through all memcg-specific caches. Callers must still + * check if the cache is valid (it is either valid or NULL). + * the slab_mutex must be held when looping through those caches + */ +#define for_each_memcg_cache_index(_idx) \ + for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) + +static inline bool memcg_kmem_enabled(void) +{ + return static_key_false(&memcg_kmem_enabled_key); +} + +/* + * In general, we'll do everything in our power to not incur any overhead + * for non-memcg users for the kmem functions. Not even a function call, if we + * can avoid it. + * + * Therefore, we'll inline all those functions so that in the best case, we'll + * see that kmemcg is off for everybody and proceed quickly. If it is on, + * we'll still do most of the flag checking inline. We check a lot of + * conditions, but because they are pretty simple, they are expected to be + * fast. 
+ */ +bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, + int order); +void __memcg_kmem_commit_charge(struct page *page, + struct mem_cgroup *memcg, int order); +void __memcg_kmem_uncharge_pages(struct page *page, int order); + +int memcg_cache_id(struct mem_cgroup *memcg); +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache); +void memcg_release_cache(struct kmem_cache *cachep); +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups); +void memcg_update_array_size(int num_groups); + +struct kmem_cache * +__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep); +void kmem_cache_destroy_memcg_children(struct kmem_cache *s); + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (!memcg_kmem_enabled()) + return true; + + /* + * __GFP_NOFAIL allocations will move on even if charging is not + * possible. Therefore we don't even try, and have this allocation + * unaccounted. We could in theory charge it with + * res_counter_charge_nofail, but we hope those allocations are rare, + * and won't be worth the trouble. + */ + if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + return true; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return true; + + /* If the task is dying, just let it go. */ + if (unlikely(fatal_signal_pending(current))) + return true; + + return __memcg_kmem_newpage_charge(gfp, memcg, order); } + +/** + * memcg_kmem_uncharge_pages: uncharge pages from memcg + * @page: pointer to struct page being freed + * @order: allocation order. + * + * there is no need to specify memcg here, since it is embedded in page_cgroup + */ +static inline void +memcg_kmem_uncharge_pages(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_pages(page, order); +} + +/** + * memcg_kmem_commit_charge: embeds correct memcg in a page + * @page: pointer to struct page recently allocated + * @memcg: the memcg structure we charged against + * @order: allocation order. + * + * Needs to be called after memcg_kmem_newpage_charge, regardless of success or + * failure of the allocation. If @page is NULL, this function will revert the + * charges. Otherwise, it will commit the memcg given by @memcg to the + * corresponding page_cgroup. + */ +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ + if (memcg_kmem_enabled() && memcg) + __memcg_kmem_commit_charge(page, memcg, order); +} + +/** + * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * @gfp: allocation flags. + * + * This function assumes that the task allocating, which determines the memcg + * in the page allocator, belongs to the same cgroup throughout the whole + * process. Misaccounting can happen if the task calls memcg_kmem_get_cache() + * while belonging to a cgroup, and later on changes.
This is considered + * acceptable, and should only happen upon task migration. + * + * Before the cache is created by the memcg core, there is also a possible + * imbalance: the task belongs to a memcg, but the cache being allocated from + * is the global cache, since the child cache is not yet guaranteed to be + * ready. This case is also fine, since in this case the GFP_KMEMCG will not be + * passed and the page allocator will not attempt any cgroup accounting. + */ +static __always_inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + if (!memcg_kmem_enabled()) + return cachep; + if (gfp & __GFP_NOFAIL) + return cachep; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return cachep; + if (unlikely(fatal_signal_pending(current))) + return cachep; + + return __memcg_kmem_get_cache(cachep, gfp); +} +#else +#define for_each_memcg_cache_index(_idx) \ + for (; NULL; ) + +static inline bool memcg_kmem_enabled(void) +{ + return false; +} + +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + return true; +} + +static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +{ +} + +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ +} + +static inline int memcg_cache_id(struct mem_cgroup *memcg) +{ + return -1; +} + +static inline int +memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + return 0; +} + +static inline void memcg_release_cache(struct kmem_cache *cachep) +{ +} + +static inline void memcg_cache_list_add(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ +} + +static inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + return cachep; +} + +static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 7d9ebb7cc982..f8f5162a3571 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -128,10 +128,7 @@ struct page { }; struct list_head list; /* slobs list of pages */ - struct { /* slab fields */ - struct kmem_cache *slab_cache; - struct slab *slab_page; - }; + struct slab *slab_page; /* slab fields */ }; /* Remainder is not double word aligned */ @@ -146,7 +143,7 @@ struct page { #if USE_SPLIT_PTLOCKS spinlock_t ptl; #endif - struct kmem_cache *slab; /* SLUB: Pointer to slab */ + struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ }; diff --git a/trunk/include/linux/mnt_namespace.h b/trunk/include/linux/mnt_namespace.h index 5a8e3903d770..12b2ab510323 100644 --- a/trunk/include/linux/mnt_namespace.h +++ b/trunk/include/linux/mnt_namespace.h @@ -4,9 +4,10 @@ struct mnt_namespace; struct fs_struct; +struct user_namespace; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, - struct fs_struct *); + struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); extern const struct file_operations proc_mounts_operations; diff --git a/trunk/include/linux/moduleparam.h b/trunk/include/linux/moduleparam.h index d6a58065c09c..137b4198fc03 100644 --- a/trunk/include/linux/moduleparam.h +++ b/trunk/include/linux/moduleparam.h @@ -16,17 +16,15 @@ /* Chosen so that structs with an unsigned long line up. 
*/ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) -#define ___module_cat(a,b) __mod_ ## a ## b -#define __module_cat(a,b) ___module_cat(a,b) #ifdef MODULE #define __MODULE_INFO(tag, name, info) \ -static const char __module_cat(name,__LINE__)[] \ +static const char __UNIQUE_ID(name)[] \ __used __attribute__((section(".modinfo"), unused, aligned(1))) \ = __stringify(tag) "=" info #else /* !MODULE */ /* This struct is here for syntactic coherency, it is not used */ #define __MODULE_INFO(tag, name, info) \ - struct __module_cat(name,__LINE__) {} + struct __UNIQUE_ID(name) {} #endif #define __MODULE_PARM_TYPE(name, _type) \ __MODULE_INFO(parmtype, name##type, #name ":" _type) diff --git a/trunk/include/linux/mtd/blktrans.h b/trunk/include/linux/mtd/blktrans.h index ed270bd2e4df..4eb0a50d0c55 100644 --- a/trunk/include/linux/mtd/blktrans.h +++ b/trunk/include/linux/mtd/blktrans.h @@ -23,6 +23,7 @@ #include #include #include +#include struct hd_geometry; struct mtd_info; @@ -43,7 +44,8 @@ struct mtd_blktrans_dev { struct kref ref; struct gendisk *disk; struct attribute_group *disk_attributes; - struct task_struct *thread; + struct workqueue_struct *wq; + struct work_struct work; struct request_queue *rq; spinlock_t queue_lock; void *priv; diff --git a/trunk/include/linux/mtd/doc2000.h b/trunk/include/linux/mtd/doc2000.h index 0f6fea73a1f6..407d1e556c39 100644 --- a/trunk/include/linux/mtd/doc2000.h +++ b/trunk/include/linux/mtd/doc2000.h @@ -92,12 +92,26 @@ * Others use readb/writeb */ #if defined(__arm__) -#define ReadDOC_(adr, reg) ((unsigned char)(*(volatile __u32 *)(((unsigned long)adr)+((reg)<<2)))) -#define WriteDOC_(d, adr, reg) do{ *(volatile __u32 *)(((unsigned long)adr)+((reg)<<2)) = (__u32)d; wmb();} while(0) +static inline u8 ReadDOC_(u32 __iomem *addr, unsigned long reg) +{ + return __raw_readl(addr + reg); +} +static inline void WriteDOC_(u8 data, u32 __iomem *addr, unsigned long reg) +{ + __raw_writel(data, addr + reg); + wmb(); +} #define DOC_IOREMAP_LEN 0x8000 #elif defined(__ppc__) -#define ReadDOC_(adr, reg) ((unsigned char)(*(volatile __u16 *)(((unsigned long)adr)+((reg)<<1)))) -#define WriteDOC_(d, adr, reg) do{ *(volatile __u16 *)(((unsigned long)adr)+((reg)<<1)) = (__u16)d; wmb();} while(0) +static inline u8 ReadDOC_(u16 __iomem *addr, unsigned long reg) +{ + return __raw_readw(addr + reg); +} +static inline void WriteDOC_(u8 data, u16 __iomem *addr, unsigned long reg) +{ + __raw_writew(data, addr + reg); + wmb(); +} #define DOC_IOREMAP_LEN 0x4000 #else #define ReadDOC_(adr, reg) readb((void __iomem *)(adr) + (reg)) diff --git a/trunk/include/linux/mtd/fsmc.h b/trunk/include/linux/mtd/fsmc.h index b20029221fb1..d6ed61ef451d 100644 --- a/trunk/include/linux/mtd/fsmc.h +++ b/trunk/include/linux/mtd/fsmc.h @@ -155,9 +155,6 @@ struct fsmc_nand_platform_data { unsigned int width; unsigned int bank; - /* CLE, ALE offsets */ - unsigned int cle_off; - unsigned int ale_off; enum access_mode mode; void (*select_bank)(uint32_t bank, uint32_t busw); diff --git a/trunk/include/linux/mtd/gpmi-nand.h b/trunk/include/linux/mtd/gpmi-nand.h deleted file mode 100644 index ed3c4e09f3d1..000000000000 --- a/trunk/include/linux/mtd/gpmi-nand.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2011 Freescale Semiconductor, Inc. All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#ifndef __MACH_MXS_GPMI_NAND_H__ -#define __MACH_MXS_GPMI_NAND_H__ - -/* The size of the resources is fixed. */ -#define GPMI_NAND_RES_SIZE 6 - -/* Resource names for the GPMI NAND driver. */ -#define GPMI_NAND_GPMI_REGS_ADDR_RES_NAME "gpmi-nand" -#define GPMI_NAND_GPMI_INTERRUPT_RES_NAME "GPMI NAND GPMI Interrupt" -#define GPMI_NAND_BCH_REGS_ADDR_RES_NAME "bch" -#define GPMI_NAND_BCH_INTERRUPT_RES_NAME "bch" -#define GPMI_NAND_DMA_CHANNELS_RES_NAME "GPMI NAND DMA Channels" -#define GPMI_NAND_DMA_INTERRUPT_RES_NAME "gpmi-dma" - -/** - * struct gpmi_nand_platform_data - GPMI NAND driver platform data. - * - * This structure communicates platform-specific information to the GPMI NAND - * driver that can't be expressed as resources. - * - * @platform_init: A pointer to a function the driver will call to - * initialize the platform (e.g., set up the pin mux). - * @min_prop_delay_in_ns: Minimum propagation delay of GPMI signals to and - * from the NAND Flash device, in nanoseconds. - * @max_prop_delay_in_ns: Maximum propagation delay of GPMI signals to and - * from the NAND Flash device, in nanoseconds. - * @max_chip_count: The maximum number of chips for which the driver - * should configure the hardware. This value most - * likely reflects the number of pins that are - * connected to a NAND Flash device. If this is - * greater than the SoC hardware can support, the - * driver will print a message and fail to initialize. - * @partitions: An optional pointer to an array of partition - * descriptions. - * @partition_count: The number of elements in the partitions array. - */ -struct gpmi_nand_platform_data { - /* SoC hardware information. */ - int (*platform_init)(void); - - /* NAND Flash information. */ - unsigned int min_prop_delay_in_ns; - unsigned int max_prop_delay_in_ns; - unsigned int max_chip_count; - - /* Medium information. 
*/ - struct mtd_partition *partitions; - unsigned partition_count; -}; -#endif diff --git a/trunk/include/linux/mtd/map.h b/trunk/include/linux/mtd/map.h index 3595a0236b0f..f6eb4332ac92 100644 --- a/trunk/include/linux/mtd/map.h +++ b/trunk/include/linux/mtd/map.h @@ -328,7 +328,7 @@ static inline int map_word_bitsset(struct map_info *map, map_word val1, map_word static inline map_word map_word_load(struct map_info *map, const void *ptr) { - map_word r; + map_word r = {{0} }; if (map_bankwidth_is_1(map)) r.x[0] = *(unsigned char *)ptr; @@ -391,7 +391,7 @@ static inline map_word map_word_ff(struct map_info *map) static inline map_word inline_map_read(struct map_info *map, unsigned long ofs) { - map_word r; + map_word uninitialized_var(r); if (map_bankwidth_is_1(map)) r.x[0] = __raw_readb(map->virt + ofs); diff --git a/trunk/include/linux/mtd/mtd.h b/trunk/include/linux/mtd/mtd.h index 81d61e704599..f9ac2897b86b 100644 --- a/trunk/include/linux/mtd/mtd.h +++ b/trunk/include/linux/mtd/mtd.h @@ -98,7 +98,7 @@ struct mtd_oob_ops { }; #define MTD_MAX_OOBFREE_ENTRIES_LARGE 32 -#define MTD_MAX_ECCPOS_ENTRIES_LARGE 448 +#define MTD_MAX_ECCPOS_ENTRIES_LARGE 640 /* * Internal ECC layout control structure. For historical reasons, there is a * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained diff --git a/trunk/include/linux/mtd/nand.h b/trunk/include/linux/mtd/nand.h index 24e915957e4f..7ccb3c59ed60 100644 --- a/trunk/include/linux/mtd/nand.h +++ b/trunk/include/linux/mtd/nand.h @@ -219,6 +219,13 @@ typedef enum { #define NAND_OWN_BUFFERS 0x00020000 /* Chip may not exist, so silence any errors in scan */ #define NAND_SCAN_SILENT_NODEV 0x00040000 +/* + * Autodetect nand buswidth with readid/onfi. + * This supposes the driver will configure the hardware in 8 bits mode + * when calling nand_scan_ident, and update its configuration + * before calling nand_scan_tail. + */ +#define NAND_BUSWIDTH_AUTO 0x00080000 /* Options set by nand scan */ /* Nand scan has allocated controller struct */ @@ -471,8 +478,8 @@ struct nand_buffers { * non 0 if ONFI supported. * @onfi_params: [INTERN] holds the ONFI page parameter when ONFI is * supported, 0 otherwise.
- * @onfi_set_features [REPLACEABLE] set the features for ONFI nand - * @onfi_get_features [REPLACEABLE] get the features for ONFI nand + * @onfi_set_features: [REPLACEABLE] set the features for ONFI nand + * @onfi_get_features: [REPLACEABLE] get the features for ONFI nand * @ecclayout: [REPLACEABLE] the default ECC placement scheme * @bbt: [INTERN] bad block table pointer * @bbt_td: [REPLACEABLE] bad block table descriptor for flash diff --git a/trunk/include/linux/mtd/sh_flctl.h b/trunk/include/linux/mtd/sh_flctl.h index 01e4b15b280e..1c28f8879b1c 100644 --- a/trunk/include/linux/mtd/sh_flctl.h +++ b/trunk/include/linux/mtd/sh_flctl.h @@ -20,6 +20,7 @@ #ifndef __SH_FLCTL_H__ #define __SH_FLCTL_H__ +#include #include #include #include @@ -107,6 +108,7 @@ #define ESTERINTE (0x1 << 24) /* ECC error interrupt enable */ #define AC1CLR (0x1 << 19) /* ECC FIFO clear */ #define AC0CLR (0x1 << 18) /* Data FIFO clear */ +#define DREQ0EN (0x1 << 16) /* FLDTFIFODMA Request Enable */ #define ECERB (0x1 << 9) /* ECC error */ #define STERB (0x1 << 8) /* Status error */ #define STERINTE (0x1 << 4) /* Status error enable */ @@ -138,6 +140,8 @@ enum flctl_ecc_res_t { FL_TIMEOUT }; +struct dma_chan; + struct sh_flctl { struct mtd_info mtd; struct nand_chip chip; @@ -147,7 +151,7 @@ struct sh_flctl { uint8_t done_buff[2048 + 64]; /* max size 2048 + 64 */ int read_bytes; - int index; + unsigned int index; int seqin_column; /* column in SEQIN cmd */ int seqin_page_addr; /* page_addr in SEQIN cmd */ uint32_t seqin_read_cmd; /* read cmd in SEQIN cmd */ @@ -161,6 +165,11 @@ struct sh_flctl { unsigned hwecc:1; /* Hardware ECC (0 = disabled, 1 = enabled) */ unsigned holden:1; /* Hardware has FLHOLDCR and HOLDEN is set */ unsigned qos_request:1; /* QoS request to prevent deep power shutdown */ + + /* DMA related objects */ + struct dma_chan *chan_fifo0_rx; + struct dma_chan *chan_fifo0_tx; + struct completion dma_complete; }; struct sh_flctl_platform_data { @@ -170,6 +179,9 @@ struct sh_flctl_platform_data { unsigned has_hwecc:1; unsigned use_holden:1; + + unsigned int slave_id_fifo0_tx; + unsigned int slave_id_fifo0_rx; }; static inline struct sh_flctl *mtd_to_flctl(struct mtd_info *mtdinfo) diff --git a/trunk/include/linux/nfs_fs_sb.h b/trunk/include/linux/nfs_fs_sb.h index a9e76ee1adca..6c6ed153a9b4 100644 --- a/trunk/include/linux/nfs_fs_sb.h +++ b/trunk/include/linux/nfs_fs_sb.h @@ -198,51 +198,4 @@ struct nfs_server { #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) - -/* maximum number of slots to use */ -#define NFS4_DEF_SLOT_TABLE_SIZE (16U) -#define NFS4_MAX_SLOT_TABLE (256U) -#define NFS4_NO_SLOT ((u32)-1) - -#if IS_ENABLED(CONFIG_NFS_V4) - -/* Sessions */ -#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) -struct nfs4_slot_table { - struct nfs4_slot *slots; /* seqid per slot */ - unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ - spinlock_t slot_tbl_lock; - struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ - u32 max_slots; /* # slots in table */ - u32 highest_used_slotid; /* sent to server on each SEQ. 
- * op for dynamic resizing */ - u32 target_max_slots; /* Set by CB_RECALL_SLOT as - * the new max_slots */ - struct completion complete; -}; - -static inline int slot_idx(struct nfs4_slot_table *tbl, struct nfs4_slot *sp) -{ - return sp - tbl->slots; -} - -/* - * Session related parameters - */ -struct nfs4_session { - struct nfs4_sessionid sess_id; - u32 flags; - unsigned long session_state; - u32 hash_alg; - u32 ssv_len; - - /* The fore and back channel */ - struct nfs4_channel_attrs fc_attrs; - struct nfs4_slot_table fc_slot_table; - struct nfs4_channel_attrs bc_attrs; - struct nfs4_slot_table bc_slot_table; - struct nfs_client *clp; -}; - -#endif /* CONFIG_NFS_V4 */ #endif diff --git a/trunk/include/linux/nfs_xdr.h b/trunk/include/linux/nfs_xdr.h index a73ea89789d1..29adb12c7ecf 100644 --- a/trunk/include/linux/nfs_xdr.h +++ b/trunk/include/linux/nfs_xdr.h @@ -185,23 +185,20 @@ struct nfs4_channel_attrs { u32 max_reqs; }; -/* nfs41 sessions slot seqid */ -struct nfs4_slot { - u32 seq_nr; -}; - +struct nfs4_slot; struct nfs4_sequence_args { - struct nfs4_session *sa_session; - u32 sa_slotid; - u8 sa_cache_this; + struct nfs4_slot *sa_slot; + u8 sa_cache_this : 1, + sa_privileged : 1; }; struct nfs4_sequence_res { - struct nfs4_session *sr_session; struct nfs4_slot *sr_slot; /* slot used to send request */ + unsigned long sr_timestamp; int sr_status; /* sequence operation status */ - unsigned long sr_renewal_time; u32 sr_status_flags; + u32 sr_highest_slotid; + u32 sr_target_highest_slotid; }; struct nfs4_get_lease_time_args { @@ -209,8 +206,8 @@ struct nfs4_get_lease_time_args { }; struct nfs4_get_lease_time_res { - struct nfs_fsinfo *lr_fsinfo; struct nfs4_sequence_res lr_seq_res; + struct nfs_fsinfo *lr_fsinfo; }; #define PNFS_LAYOUT_MAXSIZE 4096 @@ -228,23 +225,23 @@ struct pnfs_layout_range { }; struct nfs4_layoutget_args { + struct nfs4_sequence_args seq_args; __u32 type; struct pnfs_layout_range range; __u64 minlength; __u32 maxcount; struct inode *inode; struct nfs_open_context *ctx; - struct nfs4_sequence_args seq_args; nfs4_stateid stateid; struct nfs4_layoutdriver_data layout; }; struct nfs4_layoutget_res { + struct nfs4_sequence_res seq_res; __u32 return_on_close; struct pnfs_layout_range range; __u32 type; nfs4_stateid stateid; - struct nfs4_sequence_res seq_res; struct nfs4_layoutdriver_data *layoutp; }; @@ -255,38 +252,38 @@ struct nfs4_layoutget { }; struct nfs4_getdevicelist_args { + struct nfs4_sequence_args seq_args; const struct nfs_fh *fh; u32 layoutclass; - struct nfs4_sequence_args seq_args; }; struct nfs4_getdevicelist_res { - struct pnfs_devicelist *devlist; struct nfs4_sequence_res seq_res; + struct pnfs_devicelist *devlist; }; struct nfs4_getdeviceinfo_args { - struct pnfs_device *pdev; struct nfs4_sequence_args seq_args; + struct pnfs_device *pdev; }; struct nfs4_getdeviceinfo_res { - struct pnfs_device *pdev; struct nfs4_sequence_res seq_res; + struct pnfs_device *pdev; }; struct nfs4_layoutcommit_args { + struct nfs4_sequence_args seq_args; nfs4_stateid stateid; __u64 lastbytewritten; struct inode *inode; const u32 *bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_layoutcommit_res { + struct nfs4_sequence_res seq_res; struct nfs_fattr *fattr; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; int status; }; @@ -300,11 +297,11 @@ struct nfs4_layoutcommit_data { }; struct nfs4_layoutreturn_args { + struct nfs4_sequence_args seq_args; struct pnfs_layout_hdr *layout; struct inode *inode; nfs4_stateid stateid; __u32 layout_type; - 
struct nfs4_sequence_args seq_args; }; struct nfs4_layoutreturn_res { @@ -330,6 +327,7 @@ struct stateowner_id { * Arguments to the open call. */ struct nfs_openargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; struct nfs_seqid * seqid; int open_flags; @@ -350,10 +348,10 @@ struct nfs_openargs { const u32 * bitmask; const u32 * open_bitmap; __u32 claim; - struct nfs4_sequence_args seq_args; }; struct nfs_openres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_fh fh; struct nfs4_change_info cinfo; @@ -368,7 +366,6 @@ struct nfs_openres { __u32 attrset[NFS4_BITMAP_SIZE]; struct nfs4_string *owner; struct nfs4_string *group_owner; - struct nfs4_sequence_res seq_res; __u32 access_request; __u32 access_supported; __u32 access_result; @@ -392,20 +389,20 @@ struct nfs_open_confirmres { * Arguments to the close call. */ struct nfs_closeargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; fmode_t fmode; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_closeres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_fattr * fattr; struct nfs_seqid * seqid; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * * Arguments to the lock,lockt, and locku call. @@ -417,6 +414,7 @@ struct nfs_lowner { }; struct nfs_lock_args { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct file_lock * fl; struct nfs_seqid * lock_seqid; @@ -427,40 +425,39 @@ struct nfs_lock_args { unsigned char block : 1; unsigned char reclaim : 1; unsigned char new_lock_owner : 1; - struct nfs4_sequence_args seq_args; }; struct nfs_lock_res { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * lock_seqid; struct nfs_seqid * open_seqid; - struct nfs4_sequence_res seq_res; }; struct nfs_locku_args { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct file_lock * fl; struct nfs_seqid * seqid; nfs4_stateid * stateid; - struct nfs4_sequence_args seq_args; }; struct nfs_locku_res { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * seqid; - struct nfs4_sequence_res seq_res; }; struct nfs_lockt_args { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct file_lock * fl; struct nfs_lowner lock_owner; - struct nfs4_sequence_args seq_args; }; struct nfs_lockt_res { - struct file_lock * denied; /* LOCK, LOCKT failed */ struct nfs4_sequence_res seq_res; + struct file_lock * denied; /* LOCK, LOCKT failed */ }; struct nfs_release_lockowner_args { @@ -468,22 +465,23 @@ struct nfs_release_lockowner_args { }; struct nfs4_delegreturnargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; const nfs4_stateid *stateid; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_delegreturnres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * Arguments to the read call. */ struct nfs_readargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct nfs_open_context *context; struct nfs_lock_context *lock_context; @@ -491,20 +489,20 @@ struct nfs_readargs { __u32 count; unsigned int pgbase; struct page ** pages; - struct nfs4_sequence_args seq_args; }; struct nfs_readres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; __u32 count; int eof; - struct nfs4_sequence_res seq_res; }; /* * Arguments to the write call. 
*/ struct nfs_writeargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct nfs_open_context *context; struct nfs_lock_context *lock_context; @@ -514,7 +512,6 @@ struct nfs_writeargs { unsigned int pgbase; struct page ** pages; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_write_verifier { @@ -527,65 +524,65 @@ struct nfs_writeverf { }; struct nfs_writeres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; struct nfs_writeverf * verf; __u32 count; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * Arguments to the commit call. */ struct nfs_commitargs { + struct nfs4_sequence_args seq_args; struct nfs_fh *fh; __u64 offset; __u32 count; const u32 *bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_commitres { + struct nfs4_sequence_res seq_res; struct nfs_fattr *fattr; struct nfs_writeverf *verf; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * Common arguments to the unlink call */ struct nfs_removeargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh *fh; struct qstr name; - struct nfs4_sequence_args seq_args; }; struct nfs_removeres { + struct nfs4_sequence_res seq_res; const struct nfs_server *server; struct nfs_fattr *dir_attr; struct nfs4_change_info cinfo; - struct nfs4_sequence_res seq_res; }; /* * Common arguments to the rename call */ struct nfs_renameargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh *old_dir; const struct nfs_fh *new_dir; const struct qstr *old_name; const struct qstr *new_name; - struct nfs4_sequence_args seq_args; }; struct nfs_renameres { + struct nfs4_sequence_res seq_res; const struct nfs_server *server; struct nfs4_change_info old_cinfo; struct nfs_fattr *old_fattr; struct nfs4_change_info new_cinfo; struct nfs_fattr *new_fattr; - struct nfs4_sequence_res seq_res; }; /* @@ -626,20 +623,20 @@ struct nfs_createargs { }; struct nfs_setattrargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; nfs4_stateid stateid; struct iattr * iap; const struct nfs_server * server; /* Needed for name mapping */ const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_setaclargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; - struct nfs4_sequence_args seq_args; }; struct nfs_setaclres { @@ -647,27 +644,27 @@ struct nfs_setaclres { }; struct nfs_getaclargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; - struct nfs4_sequence_args seq_args; }; /* getxattr ACL interface flags */ #define NFS4_ACL_TRUNC 0x0001 /* ACL was truncated */ struct nfs_getaclres { + struct nfs4_sequence_res seq_res; size_t acl_len; size_t acl_data_offset; int acl_flags; struct page * acl_scratch; - struct nfs4_sequence_res seq_res; }; struct nfs_setattrres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; const struct nfs_server * server; - struct nfs4_sequence_res seq_res; }; struct nfs_linkargs { @@ -832,21 +829,22 @@ struct nfs3_getaclres { typedef u64 clientid4; struct nfs4_accessargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; u32 access; - struct nfs4_sequence_args seq_args; }; struct nfs4_accessres { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; u32 supported; u32 access; - struct nfs4_sequence_res seq_res; }; struct nfs4_create_arg { + struct nfs4_sequence_args 
seq_args; u32 ftype; union { struct { @@ -863,88 +861,88 @@ struct nfs4_create_arg { const struct iattr * attrs; const struct nfs_fh * dir_fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_create_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fh * fh; struct nfs_fattr * fattr; struct nfs4_change_info dir_cinfo; - struct nfs4_sequence_res seq_res; }; struct nfs4_fsinfo_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_fsinfo_res { - struct nfs_fsinfo *fsinfo; struct nfs4_sequence_res seq_res; + struct nfs_fsinfo *fsinfo; }; struct nfs4_getattr_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_getattr_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; - struct nfs4_sequence_res seq_res; }; struct nfs4_link_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const struct nfs_fh * dir_fh; const struct qstr * name; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_link_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; struct nfs4_change_info cinfo; struct nfs_fattr * dir_attr; - struct nfs4_sequence_res seq_res; }; struct nfs4_lookup_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * dir_fh; const struct qstr * name; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_lookup_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; struct nfs_fh * fh; - struct nfs4_sequence_res seq_res; }; struct nfs4_lookup_root_arg { - const u32 * bitmask; struct nfs4_sequence_args seq_args; + const u32 * bitmask; }; struct nfs4_pathconf_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_pathconf_res { - struct nfs_pathconf *pathconf; struct nfs4_sequence_res seq_res; + struct nfs_pathconf *pathconf; }; struct nfs4_readdir_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; u64 cookie; nfs4_verifier verifier; @@ -953,21 +951,20 @@ struct nfs4_readdir_arg { unsigned int pgbase; /* zero-copy data */ const u32 * bitmask; int plus; - struct nfs4_sequence_args seq_args; }; struct nfs4_readdir_res { + struct nfs4_sequence_res seq_res; nfs4_verifier verifier; unsigned int pgbase; - struct nfs4_sequence_res seq_res; }; struct nfs4_readlink { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; unsigned int pgbase; unsigned int pglen; /* zero-copy data */ struct page ** pages; /* zero-copy data */ - struct nfs4_sequence_args seq_args; }; struct nfs4_readlink_res { @@ -993,28 +990,28 @@ struct nfs4_setclientid_res { }; struct nfs4_statfs_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_statfs_res { - struct nfs_fsstat *fsstat; struct nfs4_sequence_res seq_res; + struct nfs_fsstat *fsstat; }; struct nfs4_server_caps_arg { - struct nfs_fh *fhandle; struct nfs4_sequence_args seq_args; + struct nfs_fh *fhandle; }; struct nfs4_server_caps_res { + struct nfs4_sequence_res seq_res; u32 attr_bitmask[3]; u32 acl_bitmask; u32 has_links; u32 has_symlinks; u32 fh_expire_type; - struct nfs4_sequence_res seq_res; }; #define 
NFS4_PATHNAME_MAXCOMPONENTS 512 @@ -1040,16 +1037,16 @@ struct nfs4_fs_locations { }; struct nfs4_fs_locations_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh *dir_fh; const struct qstr *name; struct page *page; const u32 *bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_fs_locations_res { - struct nfs4_fs_locations *fs_locations; struct nfs4_sequence_res seq_res; + struct nfs4_fs_locations *fs_locations; }; struct nfs4_secinfo_oid { @@ -1074,14 +1071,14 @@ struct nfs4_secinfo_flavors { }; struct nfs4_secinfo_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh *dir_fh; const struct qstr *name; - struct nfs4_sequence_args seq_args; }; struct nfs4_secinfo_res { - struct nfs4_secinfo_flavors *flavors; struct nfs4_sequence_res seq_res; + struct nfs4_secinfo_flavors *flavors; }; #endif /* CONFIG_NFS_V4 */ @@ -1161,9 +1158,9 @@ struct nfs41_create_session_res { }; struct nfs41_reclaim_complete_args { + struct nfs4_sequence_args seq_args; /* In the future extend to include curr_fh for use with migration */ unsigned char one_fs:1; - struct nfs4_sequence_args seq_args; }; struct nfs41_reclaim_complete_res { @@ -1173,28 +1170,28 @@ struct nfs41_reclaim_complete_res { #define SECINFO_STYLE_CURRENT_FH 0 #define SECINFO_STYLE_PARENT 1 struct nfs41_secinfo_no_name_args { - int style; struct nfs4_sequence_args seq_args; + int style; }; struct nfs41_test_stateid_args { - nfs4_stateid *stateid; struct nfs4_sequence_args seq_args; + nfs4_stateid *stateid; }; struct nfs41_test_stateid_res { - unsigned int status; struct nfs4_sequence_res seq_res; + unsigned int status; }; struct nfs41_free_stateid_args { - nfs4_stateid *stateid; struct nfs4_sequence_args seq_args; + nfs4_stateid *stateid; }; struct nfs41_free_stateid_res { - unsigned int status; struct nfs4_sequence_res seq_res; + unsigned int status; }; #else diff --git a/trunk/include/linux/nsproxy.h b/trunk/include/linux/nsproxy.h index cc37a55ad004..10e5947491c7 100644 --- a/trunk/include/linux/nsproxy.h +++ b/trunk/include/linux/nsproxy.h @@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, - struct fs_struct *); + struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) diff --git a/trunk/include/linux/of.h b/trunk/include/linux/of.h index 6cfea9aa401f..5ebcc5c8e423 100644 --- a/trunk/include/linux/of.h +++ b/trunk/include/linux/of.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -282,16 +283,28 @@ extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_machine_is_compatible(const char *compat); -extern int prom_add_property(struct device_node* np, struct property* prop); -extern int prom_remove_property(struct device_node *np, struct property *prop); -extern int prom_update_property(struct device_node *np, - struct property *newprop); +extern int of_add_property(struct device_node *np, struct property *prop); +extern int of_remove_property(struct device_node *np, struct property *prop); +extern int of_update_property(struct device_node *np, struct property *newprop); -#if defined(CONFIG_OF_DYNAMIC) /* For updating the device tree at runtime */ -extern void of_attach_node(struct device_node *); -extern void of_detach_node(struct device_node *); -#endif +#define OF_RECONFIG_ATTACH_NODE 0x0001 
+#define OF_RECONFIG_DETACH_NODE 0x0002 +#define OF_RECONFIG_ADD_PROPERTY 0x0003 +#define OF_RECONFIG_REMOVE_PROPERTY 0x0004 +#define OF_RECONFIG_UPDATE_PROPERTY 0x0005 + +struct of_prop_reconfig { + struct device_node *dn; + struct property *prop; +}; + +extern int of_reconfig_notifier_register(struct notifier_block *); +extern int of_reconfig_notifier_unregister(struct notifier_block *); +extern int of_reconfig_notify(unsigned long, void *); + +extern int of_attach_node(struct device_node *); +extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) diff --git a/trunk/include/linux/of_platform.h b/trunk/include/linux/of_platform.h index b47d2040c9f2..3863a4dbdf18 100644 --- a/trunk/include/linux/of_platform.h +++ b/trunk/include/linux/of_platform.h @@ -100,6 +100,7 @@ extern int of_platform_populate(struct device_node *root, #if !defined(CONFIG_OF_ADDRESS) struct of_dev_auxdata; +struct device; static inline int of_platform_populate(struct device_node *root, const struct of_device_id *matches, const struct of_dev_auxdata *lookup, diff --git a/trunk/include/linux/percpu-rwsem.h b/trunk/include/linux/percpu-rwsem.h index bd1e86071e57..3e88c9a7d57f 100644 --- a/trunk/include/linux/percpu-rwsem.h +++ b/trunk/include/linux/percpu-rwsem.h @@ -1,83 +1,34 @@ #ifndef _LINUX_PERCPU_RWSEM_H #define _LINUX_PERCPU_RWSEM_H -#include +#include +#include #include -#include -#include +#include +#include struct percpu_rw_semaphore { - unsigned __percpu *counters; - bool locked; - struct mutex mtx; + unsigned int __percpu *fast_read_ctr; + atomic_t write_ctr; + struct rw_semaphore rw_sem; + atomic_t slow_read_ctr; + wait_queue_head_t write_waitq; }; -#define light_mb() barrier() -#define heavy_mb() synchronize_sched_expedited() +extern void percpu_down_read(struct percpu_rw_semaphore *); +extern void percpu_up_read(struct percpu_rw_semaphore *); -static inline void percpu_down_read(struct percpu_rw_semaphore *p) -{ - rcu_read_lock_sched(); - if (unlikely(p->locked)) { - rcu_read_unlock_sched(); - mutex_lock(&p->mtx); - this_cpu_inc(*p->counters); - mutex_unlock(&p->mtx); - return; - } - this_cpu_inc(*p->counters); - rcu_read_unlock_sched(); - light_mb(); /* A, between read of p->locked and read of data, paired with D */ -} +extern void percpu_down_write(struct percpu_rw_semaphore *); +extern void percpu_up_write(struct percpu_rw_semaphore *); -static inline void percpu_up_read(struct percpu_rw_semaphore *p) -{ - light_mb(); /* B, between read of the data and write to p->counter, paired with C */ - this_cpu_dec(*p->counters); -} +extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, + const char *, struct lock_class_key *); +extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -static inline unsigned __percpu_count(unsigned __percpu *counters) -{ - unsigned total = 0; - int cpu; - - for_each_possible_cpu(cpu) - total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu)); - - return total; -} - -static inline void percpu_down_write(struct percpu_rw_semaphore *p) -{ - mutex_lock(&p->mtx); - p->locked = true; - synchronize_sched_expedited(); /* make sure that all readers exit the rcu_read_lock_sched region */ - while (__percpu_count(p->counters)) - msleep(1); - heavy_mb(); /* C, between read of p->counter and write to data, paired with B */ -} - -static inline void percpu_up_write(struct percpu_rw_semaphore *p) -{ - heavy_mb(); /* D, between write to data and write to p->locked, paired with A */ - p->locked = false; - mutex_unlock(&p->mtx); -} - -static inline int 
percpu_init_rwsem(struct percpu_rw_semaphore *p) -{ - p->counters = alloc_percpu(unsigned); - if (unlikely(!p->counters)) - return -ENOMEM; - p->locked = false; - mutex_init(&p->mtx); - return 0; -} - -static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p) -{ - free_percpu(p->counters); - p->counters = NULL; /* catch use after free bugs */ -} +#define percpu_init_rwsem(brw) \ +({ \ + static struct lock_class_key rwsem_key; \ + __percpu_init_rwsem(brw, #brw, &rwsem_key); \ +}) #endif diff --git a/trunk/include/linux/pid_namespace.h b/trunk/include/linux/pid_namespace.h index 65e3e87eacc5..bf285999273a 100644 --- a/trunk/include/linux/pid_namespace.h +++ b/trunk/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -31,9 +32,12 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ + unsigned int proc_inum; }; extern struct pid_namespace init_pid_ns; @@ -46,7 +50,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); +extern struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); @@ -59,8 +64,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -static inline struct pid_namespace * -copy_pid_ns(unsigned long flags, struct pid_namespace *ns) +static inline struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); diff --git a/trunk/include/linux/platform_data/i2c-cbus-gpio.h b/trunk/include/linux/platform_data/i2c-cbus-gpio.h new file mode 100644 index 000000000000..6faa992a9502 --- /dev/null +++ b/trunk/include/linux/platform_data/i2c-cbus-gpio.h @@ -0,0 +1,27 @@ +/* + * i2c-cbus-gpio.h - CBUS I2C platform_data definition + * + * Copyright (C) 2004-2009 Nokia Corporation + * + * Written by Felipe Balbi and Aaro Koskinen. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of this + * archive for more details. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __INCLUDE_LINUX_I2C_CBUS_GPIO_H +#define __INCLUDE_LINUX_I2C_CBUS_GPIO_H + +struct i2c_cbus_platform_data { + int dat_gpio; + int clk_gpio; + int sel_gpio; +}; + +#endif /* __INCLUDE_LINUX_I2C_CBUS_GPIO_H */ diff --git a/trunk/include/linux/platform_data/lp855x.h b/trunk/include/linux/platform_data/lp855x.h index 761f31752367..e81f62d24ee2 100644 --- a/trunk/include/linux/platform_data/lp855x.h +++ b/trunk/include/linux/platform_data/lp855x.h @@ -89,11 +89,6 @@ enum lp8556_brightness_source { LP8556_COMBINED2, /* pwm + i2c after the shaper block */ }; -struct lp855x_pwm_data { - void (*pwm_set_intensity) (int brightness, int max_brightness); - int (*pwm_get_intensity) (int max_brightness); -}; - struct lp855x_rom_data { u8 addr; u8 val; @@ -105,7 +100,7 @@ struct lp855x_rom_data { * @mode : brightness control by pwm or lp855x register * @device_control : value of DEVICE CONTROL register * @initial_brightness : initial value of backlight brightness - * @pwm_data : platform specific pwm generation functions. + * @period_ns : platform specific pwm period value. unit is nano. Only valid when mode is PWM_BASED. * @load_new_rom_data : 0 : use default configuration data @@ -118,7 +113,7 @@ struct lp855x_platform_data { enum lp855x_brightness_ctrl_mode mode; u8 device_control; int initial_brightness; - struct lp855x_pwm_data pwm_data; + unsigned int period_ns; u8 load_new_rom_data; int size_program; struct lp855x_rom_data *rom_data; diff --git a/trunk/include/linux/platform_data/mtd-nomadik-nand.h b/trunk/include/linux/platform_data/mtd-nomadik-nand.h deleted file mode 100644 index c3c8254c22a5..000000000000 --- a/trunk/include/linux/platform_data/mtd-nomadik-nand.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __ASM_ARCH_NAND_H -#define __ASM_ARCH_NAND_H - -struct nomadik_nand_platform_data { - struct mtd_partition *parts; - int nparts; - int options; - int (*init) (void); - int (*exit) (void); -}; - -#define NAND_IO_DATA 0x40000000 -#define NAND_IO_CMD 0x40800000 -#define NAND_IO_ADDR 0x41000000 - -#endif /* __ASM_ARCH_NAND_H */ diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index 3fd2e871ff1b..32676b35d2f5 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -28,7 +28,11 @@ struct mm_struct; */ enum { - PROC_ROOT_INO = 1, + PROC_ROOT_INO = 1, + PROC_IPC_INIT_INO = 0xEFFFFFFFU, + PROC_UTS_INIT_INO = 0xEFFFFFFEU, + PROC_USER_INIT_INO = 0xEFFFFFFDU, + PROC_PID_INIT_INO = 0xEFFFFFFCU, }; /* @@ -174,7 +178,10 @@ extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, struct proc_dir_entry *parent); extern struct file *proc_ns_fget(int fd); +extern bool proc_ns_inode(struct inode *inode); +extern int proc_alloc_inum(unsigned int *pino); +extern void proc_free_inum(unsigned int inum); #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -229,6 +236,19 @@ static inline struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } +static inline bool proc_ns_inode(struct inode *inode) +{ + return false; +} + +static inline int proc_alloc_inum(unsigned int *inum) +{ + *inum = 1; + return 0; +} +static inline void proc_free_inum(unsigned int inum) +{ +} #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -247,10 +267,14 @@ struct proc_ns_operations { void *(*get)(struct task_struct *task); void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); + unsigned int (*inum)(void *ns); }; extern const struct proc_ns_operations netns_operations; extern const 
struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); @@ -290,4 +314,7 @@ static inline struct net *PDE_NET(struct proc_dir_entry *pde) return pde->parent->data; } +#include + +void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set); #endif /* _LINUX_PROC_FS_H */ diff --git a/trunk/include/linux/ptrace.h b/trunk/include/linux/ptrace.h index a89ff04bddd9..addfbe7c180e 100644 --- a/trunk/include/linux/ptrace.h +++ b/trunk/include/linux/ptrace.h @@ -32,6 +32,8 @@ #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) +#define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) + /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 #define PT_SINGLESTEP (1<> 32) ^ (seed << 10) ^ seed; diff --git a/trunk/include/linux/res_counter.h b/trunk/include/linux/res_counter.h index 6f54e40fa218..5ae8456d9670 100644 --- a/trunk/include/linux/res_counter.h +++ b/trunk/include/linux/res_counter.h @@ -125,14 +125,16 @@ int res_counter_charge_nofail(struct res_counter *counter, * * these calls check for usage underflow and show a warning on the console * _locked call expects the counter->lock to be taken + * + * returns the total charges still present in @counter. */ -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); -void res_counter_uncharge(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val); +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val); /** * res_counter_margin - calculate chargeable space of a counter * @cnt: the counter diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index b089c92c609b..f712465b05c5 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -1597,6 +1597,7 @@ struct task_struct { unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; + unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; @@ -1778,12 +1779,6 @@ static inline int is_global_init(struct task_struct *tsk) return tsk->pid == 1; } -/* - * is_container_init: - * check whether in the task is init in its own pid namespace. - */ -extern int is_container_init(struct task_struct *tsk); - extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); diff --git a/trunk/include/linux/security.h b/trunk/include/linux/security.h index 05e88bdcf7d9..0f6afc657f77 100644 --- a/trunk/include/linux/security.h +++ b/trunk/include/linux/security.h @@ -694,6 +694,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * userspace to load a kernel module with the given name. * @kmod_name name of the module requested by the kernel * Return 0 if successful. + * @kernel_module_from_file: + * Load a kernel module from userspace. 
+ * @file contains the file structure pointing to the file containing + * the kernel module to load. If the module is being loaded from a blob, + * this argument will be NULL. + * Return 0 if permission is granted. * @task_fix_setuid: * Update the module's state after setting one or more of the user * identity attributes of the current process. The @flags parameter @@ -1508,6 +1514,7 @@ struct security_operations { int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*kernel_module_request)(char *kmod_name); + int (*kernel_module_from_file)(struct file *file); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); int (*task_setpgid) (struct task_struct *p, pid_t pgid); @@ -1765,6 +1772,7 @@ void security_transfer_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_kernel_module_request(char *kmod_name); +int security_kernel_module_from_file(struct file *file); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); int security_task_setpgid(struct task_struct *p, pid_t pgid); @@ -2278,6 +2286,11 @@ static inline int security_kernel_module_request(char *kmod_name) return 0; } +static inline int security_kernel_module_from_file(struct file *file) +{ + return 0; +} + static inline int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) diff --git a/trunk/include/linux/slab.h b/trunk/include/linux/slab.h index 83d1a1454b7e..5d168d7e0a28 100644 --- a/trunk/include/linux/slab.h +++ b/trunk/include/linux/slab.h @@ -11,6 +11,8 @@ #include #include +#include + /* * Flags to pass to kmem_cache_create(). @@ -116,6 +118,7 @@ struct kmem_cache { }; #endif +struct mem_cgroup; /* * struct kmem_cache related prototypes */ @@ -125,10 +128,12 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, + unsigned long, void (*)(void *), struct kmem_cache *); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); -unsigned int kmem_cache_size(struct kmem_cache *); /* * Please use this macro to create slab caches. Simply specify the @@ -176,6 +181,48 @@ unsigned int kmem_cache_size(struct kmem_cache *); #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * This is the main placeholder for memcg-related information in kmem caches. + * struct kmem_cache will hold a pointer to it, so the memory cost while + * disabled is 1 pointer. The runtime cost while enabled gets bigger than it + * would otherwise be if that would be bundled in kmem_cache: we'll need an + * extra pointer chase. But the trade-off clearly lies in favor of not + * penalizing non-users. + * + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. + * + * Child caches will hold extra metadata needed for its operation.
Fields are: + * + * @memcg: pointer to the memcg this cache belongs to + * @list: list_head for the list of all caches in this memcg + * @root_cache: pointer to the global, root cache, this cache was derived from + * @dead: set to true after the memcg dies; the cache may still be around. + * @nr_pages: number of pages that belong to this cache. + * @destroy: worker to be called whenever we are ready, or believe we may be + * ready, to destroy this cache. + */ +struct memcg_cache_params { + bool is_root_cache; + union { + struct kmem_cache *memcg_caches[0]; + struct { + struct mem_cgroup *memcg; + struct list_head list; + struct kmem_cache *root_cache; + bool dead; + atomic_t nr_pages; + struct work_struct destroy; + }; + }; +}; + +int memcg_update_all_caches(int num_memcgs); + +struct seq_file; +int cache_show(struct kmem_cache *s, struct seq_file *m); +void print_slabinfo_header(struct seq_file *m); /* * Common kmalloc functions provided by all allocators @@ -388,6 +435,14 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +/* + * Determine the size of a slab object + */ +static inline unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->object_size; +} + void __init kmem_cache_init_late(void); #endif /* _LINUX_SLAB_H */ diff --git a/trunk/include/linux/slab_def.h b/trunk/include/linux/slab_def.h index cc290f0bdb34..8bb6e0eaf3c6 100644 --- a/trunk/include/linux/slab_def.h +++ b/trunk/include/linux/slab_def.h @@ -81,6 +81,9 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; +#endif /* 6) per-cpu/per-node data, touched during every alloc/free */ /* @@ -89,9 +92,13 @@ struct kmem_cache { * (see kmem_cache_init()) * We still use [NR_CPUS] and not [1] or [0] because cache_cache * is statically defined, so we reserve the max number of cpus. + * + * We also need to guarantee that the list is able to accommodate a + * pointer for each node since "nodelists" uses the remainder of + * available pointers. */ struct kmem_list3 **nodelists; - struct array_cache *array[NR_CPUS]; + struct array_cache *array[NR_CPUS + MAX_NUMNODES]; /* * Do not add fields after array[] */ diff --git a/trunk/include/linux/slub_def.h b/trunk/include/linux/slub_def.h index df448adb7283..9db4825cd393 100644 --- a/trunk/include/linux/slub_def.h +++ b/trunk/include/linux/slub_def.h @@ -101,6 +101,10 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; + int max_attr_size; /* for propagation, maximum size of a stored attr */ +#endif #ifdef CONFIG_NUMA /* @@ -222,7 +226,10 @@ void *__kmalloc(size_t size, gfp_t flags); static __always_inline void * kmalloc_order(size_t size, gfp_t flags, unsigned int order) { - void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + void *ret; + + flags |= (__GFP_COMP | __GFP_KMEMCG); + ret = (void *) __get_free_pages(flags, order); kmemleak_alloc(ret, size, 1, flags); return ret; } diff --git a/trunk/include/linux/string.h b/trunk/include/linux/string.h index 630125818ca8..ac889c5ea11b 100644 --- a/trunk/include/linux/string.h +++ b/trunk/include/linux/string.h @@ -143,4 +143,15 @@ static inline bool strstarts(const char *str, const char *prefix) extern size_t memweight(const void *ptr, size_t bytes); +/** + * kbasename - return the last part of a pathname. + * + * @path: path to extract the filename from.
+ */ +static inline const char *kbasename(const char *path) +{ + const char *tail = strrchr(path, '/'); + return tail ? tail + 1 : path; +} + #endif /* _LINUX_STRING_H_ */ diff --git a/trunk/include/linux/sunrpc/sched.h b/trunk/include/linux/sunrpc/sched.h index dc0c3cc3ada3..b64f8eb0b973 100644 --- a/trunk/include/linux/sunrpc/sched.h +++ b/trunk/include/linux/sunrpc/sched.h @@ -192,7 +192,6 @@ struct rpc_wait_queue { pid_t owner; /* process id of last task serviced */ unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ unsigned char priority; /* current priority */ - unsigned char count; /* # task groups remaining serviced so far */ unsigned char nr; /* # tasks remaining for cookie */ unsigned short qlen; /* total # tasks waiting in queue */ struct rpc_timer timer_list; diff --git a/trunk/include/linux/syscalls.h b/trunk/include/linux/syscalls.h index 91835e7f364d..6caee34bf8a2 100644 --- a/trunk/include/linux/syscalls.h +++ b/trunk/include/linux/syscalls.h @@ -560,10 +560,10 @@ asmlinkage long sys_utime(char __user *filename, asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes); asmlinkage long sys_lseek(unsigned int fd, off_t offset, - unsigned int origin); + unsigned int whence); asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, unsigned long offset_low, loff_t __user *result, - unsigned int origin); + unsigned int whence); asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); asmlinkage long sys_readv(unsigned long fd, @@ -880,4 +880,5 @@ asmlinkage long sys_process_vm_writev(pid_t pid, asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2); +asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags); #endif diff --git a/trunk/include/linux/thread_info.h b/trunk/include/linux/thread_info.h index ccc1899bd62e..e7e04736802f 100644 --- a/trunk/include/linux/thread_info.h +++ b/trunk/include/linux/thread_info.h @@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif +#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) + /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/trunk/include/linux/usb/usbnet.h b/trunk/include/linux/usb/usbnet.h index 9bbeabf66c54..bd45eb7bedc8 100644 --- a/trunk/include/linux/usb/usbnet.h +++ b/trunk/include/linux/usb/usbnet.h @@ -69,6 +69,7 @@ struct usbnet { # define EVENT_DEV_ASLEEP 6 # define EVENT_DEV_OPEN 7 # define EVENT_DEVICE_REPORT_IDLE 8 +# define EVENT_NO_RUNTIME_PM 9 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -240,4 +241,6 @@ extern void usbnet_set_msglevel(struct net_device *, u32); extern void usbnet_get_drvinfo(struct net_device *, struct ethtool_drvinfo *); extern int usbnet_nway_reset(struct net_device *net); +extern int usbnet_manage_power(struct usbnet *, int); + #endif /* __LINUX_USB_USBNET_H */ diff --git a/trunk/include/linux/user_namespace.h b/trunk/include/linux/user_namespace.h index 95142cae446a..b9bd2e6c73cc 100644 --- a/trunk/include/linux/user_namespace.h +++ b/trunk/include/linux/user_namespace.h @@ -25,6 +25,7 @@ struct user_namespace { struct user_namespace *parent; kuid_t owner; kgid_t group; + unsigned int proc_inum; }; extern struct user_namespace init_user_ns; @@ -39,6 +40,7 @@ static inline struct user_namespace 
*get_user_ns(struct user_namespace *ns) } extern int create_user_ns(struct cred *new); +extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -66,6 +68,14 @@ static inline int create_user_ns(struct cred *new) return -EINVAL; } +static inline int unshare_userns(unsigned long unshare_flags, + struct cred **new_cred) +{ + if (unshare_flags & CLONE_NEWUSER) + return -EINVAL; + return 0; +} + static inline void put_user_ns(struct user_namespace *ns) { } diff --git a/trunk/include/linux/utsname.h b/trunk/include/linux/utsname.h index 2b345206722a..239e27733d6c 100644 --- a/trunk/include/linux/utsname.h +++ b/trunk/include/linux/utsname.h @@ -23,6 +23,7 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; + unsigned int proc_inum; }; extern struct uts_namespace init_uts_ns; @@ -33,7 +34,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) } extern struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk); + struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct kref *kref); static inline void put_uts_ns(struct uts_namespace *ns) @@ -50,12 +51,12 @@ static inline void put_uts_ns(struct uts_namespace *ns) } static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) return ERR_PTR(-EINVAL); - return tsk->nsproxy->uts_ns; + return old_ns; } #endif diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 168dfe122dd3..7cb64d4b499d 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -550,6 +550,170 @@ do { \ __ret; \ }) + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + finish_wait(&wq, &__wait); \ +} while (0) + +/** + * wait_event_lock_irq_cmd - sleep until a condition gets true. The + * condition is checked under the lock. This + * is expected to be called with the lock + * taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before cmd + * and schedule() and reacquired afterwards. + * @cmd: a command which is invoked outside the critical section before + * sleep + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before invoking the cmd and going to sleep and is reacquired + * afterwards. + */ +#define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +/** + * wait_event_lock_irq - sleep until a condition gets true. The + * condition is checked under the lock. This + * is expected to be called with the lock + * taken. 
+ * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before schedule() + * and reacquired afterwards. + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before going to sleep and is reacquired afterwards. + */ +#define wait_event_lock_irq(wq, condition, lock) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, ); \ +} while (0) + + +#define __wait_event_interruptible_lock_irq(wq, condition, \ + lock, ret, cmd) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + finish_wait(&wq, &__wait); \ +} while (0) + +/** + * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. + * The condition is checked under the lock. This is expected to + * be called with the lock taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before cmd and + * schedule() and reacquired afterwards. + * @cmd: a command which is invoked outside the critical section before + * sleep + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or a signal is received. The @condition is + * checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before invoking the cmd and going to sleep and is reacquired + * afterwards. + * + * The macro will return -ERESTARTSYS if it was interrupted by a signal + * and 0 if @condition evaluated to true. + */ +#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ +({ \ + int __ret = 0; \ + \ + if (!(condition)) \ + __wait_event_interruptible_lock_irq(wq, condition, \ + lock, __ret, cmd); \ + __ret; \ +}) + +/** + * wait_event_interruptible_lock_irq - sleep until a condition gets true. + * The condition is checked under the lock. This is expected + * to be called with the lock taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before schedule() + * and reacquired afterwards. + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or signal is received. The @condition is + * checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before going to sleep and is reacquired afterwards. + * + * The macro will return -ERESTARTSYS if it was interrupted by a signal + * and 0 if @condition evaluated to true. 
+ */ +#define wait_event_interruptible_lock_irq(wq, condition, lock) \ +({ \ + int __ret = 0; \ + \ + if (!(condition)) \ + __wait_event_interruptible_lock_irq(wq, condition, \ + lock, __ret, ); \ + __ret; \ +}) + + /* * These are the old interfaces to sleep waiting for an event. * They are racy. DO NOT use them, use the wait_event* interfaces above. diff --git a/trunk/include/net/inet_connection_sock.h b/trunk/include/net/inet_connection_sock.h index ba1d3615acbb..183292722f6e 100644 --- a/trunk/include/net/inet_connection_sock.h +++ b/trunk/include/net/inet_connection_sock.h @@ -318,6 +318,7 @@ extern void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long max_rto); extern void inet_csk_destroy_sock(struct sock *sk); +extern void inet_csk_prepare_forced_close(struct sock *sk); /* * LISTEN is a special case for poll.. diff --git a/trunk/include/net/ndisc.h b/trunk/include/net/ndisc.h index 7af1ea893038..23b3a7c58783 100644 --- a/trunk/include/net/ndisc.h +++ b/trunk/include/net/ndisc.h @@ -78,6 +78,13 @@ struct ra_msg { __be32 retrans_timer; }; +struct rd_msg { + struct icmp6hdr icmph; + struct in6_addr target; + struct in6_addr dest; + __u8 opt[0]; +}; + struct nd_opt_hdr { __u8 nd_opt_type; __u8 nd_opt_len; diff --git a/trunk/include/net/net_namespace.h b/trunk/include/net/net_namespace.h index c5a43f56b796..de644bcd8613 100644 --- a/trunk/include/net/net_namespace.h +++ b/trunk/include/net/net_namespace.h @@ -56,6 +56,8 @@ struct net { struct user_namespace *user_ns; /* Owning user namespace */ + unsigned int proc_inum; + struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; diff --git a/trunk/include/trace/events/btrfs.h b/trunk/include/trace/events/btrfs.h index 54fab041b22a..ea546a4e9609 100644 --- a/trunk/include/trace/events/btrfs.h +++ b/trunk/include/trace/events/btrfs.h @@ -45,7 +45,8 @@ struct extent_buffer; #define show_root_type(obj) \ obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ - (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" + (obj >= BTRFS_ROOT_TREE_OBJECTID && \ + obj <= BTRFS_CSUM_TREE_OBJECTID)) ? 
__show_root_type(obj) : "-" #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ diff --git a/trunk/include/trace/events/gfpflags.h b/trunk/include/trace/events/gfpflags.h index d6fd8e5b14b7..1eddbf1557f2 100644 --- a/trunk/include/trace/events/gfpflags.h +++ b/trunk/include/trace/events/gfpflags.h @@ -34,6 +34,7 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/trunk/include/uapi/asm-generic/unistd.h b/trunk/include/uapi/asm-generic/unistd.h index 6e595ba545f4..2c531f478410 100644 --- a/trunk/include/uapi/asm-generic/unistd.h +++ b/trunk/include/uapi/asm-generic/unistd.h @@ -690,9 +690,11 @@ __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ compat_sys_process_vm_writev) #define __NR_kcmp 272 __SYSCALL(__NR_kcmp, sys_kcmp) +#define __NR_finit_module 273 +__SYSCALL(__NR_finit_module, sys_finit_module) #undef __NR_syscalls -#define __NR_syscalls 273 +#define __NR_syscalls 274 /* * All syscalls below here should go away really, diff --git a/trunk/include/uapi/linux/if_bridge.h b/trunk/include/uapi/linux/if_bridge.h index afbb18a0227c..5db297514aec 100644 --- a/trunk/include/uapi/linux/if_bridge.h +++ b/trunk/include/uapi/linux/if_bridge.h @@ -163,6 +163,9 @@ struct br_port_msg { struct br_mdb_entry { __u32 ifindex; +#define MDB_TEMPORARY 0 +#define MDB_PERMANENT 1 + __u8 state; struct { union { __be32 ip4; diff --git a/trunk/include/uapi/linux/module.h b/trunk/include/uapi/linux/module.h new file mode 100644 index 000000000000..38da4258b12f --- /dev/null +++ b/trunk/include/uapi/linux/module.h @@ -0,0 +1,8 @@ +#ifndef _UAPI_LINUX_MODULE_H +#define _UAPI_LINUX_MODULE_H + +/* Flags for sys_finit_module: */ +#define MODULE_INIT_IGNORE_MODVERSIONS 1 +#define MODULE_INIT_IGNORE_VERMAGIC 2 + +#endif /* _UAPI_LINUX_MODULE_H */ diff --git a/trunk/include/uapi/linux/ptrace.h b/trunk/include/uapi/linux/ptrace.h index 1ef6c056a9e4..022ab186a812 100644 --- a/trunk/include/uapi/linux/ptrace.h +++ b/trunk/include/uapi/linux/ptrace.h @@ -73,7 +73,10 @@ #define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT) #define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP) -#define PTRACE_O_MASK 0x000000ff +/* eventless options */ +#define PTRACE_O_EXITKILL (1 << 20) + +#define PTRACE_O_MASK (0x000000ff | PTRACE_O_EXITKILL) #include diff --git a/trunk/include/uapi/linux/swab.h b/trunk/include/uapi/linux/swab.h index e811474724c2..0e011eb91b5d 100644 --- a/trunk/include/uapi/linux/swab.h +++ b/trunk/include/uapi/linux/swab.h @@ -45,7 +45,9 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val) { -#ifdef __arch_swab16 +#ifdef __HAVE_BUILTIN_BSWAP16__ + return __builtin_bswap16(val); +#elif defined (__arch_swab16) return __arch_swab16(val); #else return ___constant_swab16(val); @@ -54,7 +56,9 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val) static inline __attribute_const__ __u32 __fswab32(__u32 val) { -#ifdef __arch_swab32 +#ifdef __HAVE_BUILTIN_BSWAP32__ + return __builtin_bswap32(val); +#elif defined(__arch_swab32) return __arch_swab32(val); #else return ___constant_swab32(val); @@ -63,7 +67,9 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val) static inline __attribute_const__ __u64 __fswab64(__u64 val) { -#ifdef __arch_swab64 +#ifdef 
__HAVE_BUILTIN_BSWAP64__ + return __builtin_bswap64(val); +#elif defined (__arch_swab64) return __arch_swab64(val); #elif defined(__SWAB_64_THRU_32__) __u32 h = val >> 32; diff --git a/trunk/include/xen/interface/event_channel.h b/trunk/include/xen/interface/event_channel.h index 2090881c3650..f4942921e202 100644 --- a/trunk/include/xen/interface/event_channel.h +++ b/trunk/include/xen/interface/event_channel.h @@ -177,6 +177,19 @@ struct evtchn_unmask { evtchn_port_t port; }; +/* + * EVTCHNOP_reset: Close all event channels associated with specified domain. + * NOTES: + * 1. may be specified as DOMID_SELF. + * 2. Only a sufficiently-privileged domain may specify other than DOMID_SELF. + */ +#define EVTCHNOP_reset 10 +struct evtchn_reset { + /* IN parameters. */ + domid_t dom; +}; +typedef struct evtchn_reset evtchn_reset_t; + struct evtchn_op { uint32_t cmd; /* EVTCHNOP_* */ union { diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 1a207efca591..7d30240e5bfe 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -882,7 +882,7 @@ config MEMCG_SWAP_ENABLED config MEMCG_KMEM bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" depends on MEMCG && EXPERIMENTAL - default n + depends on SLUB || SLAB help The Kernel Memory extension for Memory Resource Controller can limit the amount of memory used by kernel objects in the system. Those are @@ -1069,11 +1069,9 @@ config UIDGID_CONVERTED # Filesystems depends on 9P_FS = n depends on AFS_FS = n - depends on AUTOFS4_FS = n depends on CEPH_FS = n depends on CIFS = n depends on CODA_FS = n - depends on FUSE_FS = n depends on GFS2_FS = n depends on NCP_FS = n depends on NFSD = n diff --git a/trunk/init/do_mounts.c b/trunk/init/do_mounts.c index f8a66424360d..1d1b6348f903 100644 --- a/trunk/init/do_mounts.c +++ b/trunk/init/do_mounts.c @@ -69,23 +69,28 @@ __setup("ro", readonly); __setup("rw", readwrite); #ifdef CONFIG_BLOCK +struct uuidcmp { + const char *uuid; + int len; +}; + /** * match_dev_by_uuid - callback for finding a partition using its uuid * @dev: device passed in by the caller - * @data: opaque pointer to a 36 byte char array with a UUID + * @data: opaque pointer to the desired struct uuidcmp to match * * Returns 1 if the device matches, and 0 otherwise. */ static int match_dev_by_uuid(struct device *dev, void *data) { - u8 *uuid = data; + struct uuidcmp *cmp = data; struct hd_struct *part = dev_to_part(dev); if (!part->info) goto no_match; - if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) - goto no_match; + if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + goto no_match; return 1; no_match: @@ -95,7 +100,7 @@ static int match_dev_by_uuid(struct device *dev, void *data) /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID - * @uuid: min 36 byte char array containing a hex ascii UUID + * @uuid: char array containing ascii UUID * * The function will return the first partition which contains a matching * UUID value in its partition_meta_info struct. This does not search @@ -106,38 +111,41 @@ static int match_dev_by_uuid(struct device *dev, void *data) * * Returns the matching dev_t on success or 0 on failure. 
*/ -static dev_t devt_from_partuuid(char *uuid_str) +static dev_t devt_from_partuuid(const char *uuid_str) { dev_t res = 0; + struct uuidcmp cmp; struct device *dev = NULL; - u8 uuid[16]; struct gendisk *disk; struct hd_struct *part; int offset = 0; + bool clear_root_wait = false; + char *slash; - if (strlen(uuid_str) < 36) - goto done; + cmp.uuid = uuid_str; + slash = strchr(uuid_str, '/'); /* Check for optional partition number offset attributes. */ - if (uuid_str[36]) { + if (slash) { char c = 0; /* Explicitly fail on poor PARTUUID syntax. */ - if (sscanf(&uuid_str[36], - "/PARTNROFF=%d%c", &offset, &c) != 1) { - printk(KERN_ERR "VFS: PARTUUID= is invalid.\n" - "Expected PARTUUID=[/PARTNROFF=%%d]\n"); - if (root_wait) - printk(KERN_ERR - "Disabling rootwait; root= is invalid.\n"); - root_wait = 0; + if (sscanf(slash + 1, + "PARTNROFF=%d%c", &offset, &c) != 1) { + clear_root_wait = true; goto done; } + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); } - /* Pack the requested UUID in the expected format. */ - part_pack_uuid(uuid_str, uuid); + if (!cmp.len) { + clear_root_wait = true; + goto done; + } - dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); + dev = class_find_device(&block_class, NULL, &cmp, + &match_dev_by_uuid); if (!dev) goto done; @@ -158,6 +166,13 @@ static dev_t devt_from_partuuid(char *uuid_str) no_offset: put_device(dev); done: + if (clear_root_wait) { + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + } return res; } #endif @@ -174,6 +189,10 @@ static dev_t devt_from_partuuid(char *uuid_str) * used when disk name of partitioned disk ends on a digit. * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. * 7) PARTUUID=/PARTNROFF= to select a partition in relation to * a partition with a known unique id. 
* diff --git a/trunk/init/main.c b/trunk/init/main.c index 63ae904a99a8..baf1f0f5c461 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -812,7 +812,6 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); - current->signal->flags |= SIGNAL_UNKILLABLE; flush_delayed_fput(); if (ramdisk_execute_command) { diff --git a/trunk/init/version.c b/trunk/init/version.c index 86fe0ccb997a..58170f18912d 100644 --- a/trunk/init/version.c +++ b/trunk/init/version.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a @@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, + .proc_inum = PROC_UTS_INIT_INO, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/trunk/ipc/msgutil.c b/trunk/ipc/msgutil.c index 26143d377c95..6471f1bdae96 100644 --- a/trunk/ipc/msgutil.c +++ b/trunk/ipc/msgutil.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "util.h" @@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .count = ATOMIC_INIT(1), .user_ns = &init_user_ns, + .proc_inum = PROC_IPC_INIT_INO, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/trunk/ipc/namespace.c b/trunk/ipc/namespace.c index f362298c5ce4..7c1fa451b0b0 100644 --- a/trunk/ipc/namespace.c +++ b/trunk/ipc/namespace.c @@ -16,7 +16,7 @@ #include "util.h" -static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, +static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; @@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, if (ns == NULL) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + atomic_set(&ns->count, 1); err = mq_init_ns(ns); if (err) { + proc_free_inum(ns->proc_inum); kfree(ns); return ERR_PTR(err); } @@ -46,19 +53,17 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); return ns; } struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct ipc_namespace *ns) { - struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; - if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); - return create_ipc_ns(tsk, ns); + return create_ipc_ns(user_ns, ns); } /* @@ -113,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -161,8 +167,13 @@ static void ipcns_put(void *ns) return put_ipc_ns(ns); } -static int ipcns_install(struct nsproxy *nsproxy, void *ns) +static int ipcns_install(struct nsproxy *nsproxy, void *new) { + struct ipc_namespace *ns = new; + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Ditch state from the old ipc namespace */ exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); @@ -170,10 +181,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int ipcns_inum(void *vp) +{ + struct ipc_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, + .inum = ipcns_inum, }; diff 
--git a/trunk/kernel/Makefile b/trunk/kernel/Makefile index ac0d533eb7de..6c072b6da239 100644 --- a/trunk/kernel/Makefile +++ b/trunk/kernel/Makefile @@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -137,10 +137,14 @@ ifeq ($(CONFIG_MODULE_SIG),y) # # Pull the signing certificate and any extra certificates into the kernel # + +quiet_cmd_touch = TOUCH $@ + cmd_touch = touch $@ + extra_certificates: - touch $@ + $(call cmd,touch) -kernel/modsign_pubkey.o: signing_key.x509 extra_certificates +kernel/modsign_certificate.o: signing_key.x509 extra_certificates ############################################################################### # diff --git a/trunk/kernel/cgroup.c b/trunk/kernel/cgroup.c index f34c41bfaa37..4855892798fd 100644 --- a/trunk/kernel/cgroup.c +++ b/trunk/kernel/cgroup.c @@ -1333,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* See feature-removal-schedule.txt */ if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); @@ -3409,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/trunk/kernel/compat.c b/trunk/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/trunk/kernel/compat.c +++ b/trunk/kernel/compat.c @@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } +#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL +asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval) +{ + struct timespec t; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); + set_fs(old_fs); + if (put_compat_timespec(&t, interval)) + return -EFAULT; + return ret; +} +#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ + /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/trunk/kernel/cred.c b/trunk/kernel/cred.c index 8888afb846e9..e0573a43c7df 100644 --- a/trunk/kernel/cred.c +++ b/trunk/kernel/cred.c @@ -372,6 +372,31 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) return ret; } +static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) +{ + const struct user_namespace *set_ns = set->user_ns; + const struct user_namespace *subset_ns = subset->user_ns; + + /* If the two credentials are in the same user namespace see if + * the capabilities of subset are a subset of set. 
+ */ + if (set_ns == subset_ns) + return cap_issubset(subset->cap_permitted, set->cap_permitted); + + /* The credentials are in a different user namespaces + * therefore one is a subset of the other only if a set is an + * ancestor of subset and set->euid is owner of subset or one + * of subsets ancestors. + */ + for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { + if ((set_ns == subset_ns->parent) && + uid_eq(subset_ns->owner, set->euid)) + return true; + } + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -410,7 +435,7 @@ int commit_creds(struct cred *new) !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || - !cap_issubset(new->cap_permitted, old->cap_permitted)) { + !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/trunk/kernel/events/core.c b/trunk/kernel/events/core.c index f9ff5493171d..301079d06f24 100644 --- a/trunk/kernel/events/core.c +++ b/trunk/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index 50d2e93c36ea..b4df21937216 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). - */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 115d6c2e4cca..85f6d536608d 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? 
page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + if (is_child_reaper(pid)) { + ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); @@ -1473,8 +1473,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) @@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; } /* @@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. 
*/ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/trunk/kernel/irq/manage.c b/trunk/kernel/irq/manage.c index 35c70c9e24d8..e49a288fa479 100644 --- a/trunk/kernel/irq/manage.c +++ b/trunk/kernel/irq/manage.c @@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); diff --git a/trunk/kernel/modsign_certificate.S b/trunk/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/trunk/kernel/modsign_certificate.S @@ -0,0 +1,19 @@ +/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ +#ifndef SYMBOL_PREFIX +#define ASM_SYMBOL(sym) sym +#else +#define PASTE2(x,y) x##y +#define PASTE(x,y) PASTE2(x,y) +#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) +#endif + +#define GLOBAL(name) \ + .globl ASM_SYMBOL(name); \ + ASM_SYMBOL(name): + + .section ".init.data","aw" + +GLOBAL(modsign_certificate_list) + .incbin "signing_key.x509" + .incbin "extra_certificates" +GLOBAL(modsign_certificate_list_end) diff --git a/trunk/kernel/modsign_pubkey.c b/trunk/kernel/modsign_pubkey.c index 767e559dfb10..045504fffbb2 100644 --- a/trunk/kernel/modsign_pubkey.c +++ b/trunk/kernel/modsign_pubkey.c @@ -20,12 +20,6 @@ struct key *modsign_keyring; extern __initdata const u8 modsign_certificate_list[]; extern __initdata const u8 modsign_certificate_list_end[]; -asm(".section .init.data,\"aw\"\n" - SYMBOL_PREFIX "modsign_certificate_list:\n" - ".incbin \"signing_key.x509\"\n" - ".incbin \"extra_certificates\"\n" - SYMBOL_PREFIX "modsign_certificate_list_end:" - ); /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice diff --git a/trunk/kernel/module.c b/trunk/kernel/module.c index 6e48c3a43599..250092c1d57d 100644 --- a/trunk/kernel/module.c +++ b/trunk/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +61,7 @@ #include #include #include +#include #include "module-internal.h" #define CREATE_TRACE_POINTS @@ -372,9 +375,6 @@ 
static bool check_symbol(const struct symsearch *syms, printk(KERN_WARNING "Symbol %s is being used " "by a non-GPL module, which will not " "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); } } @@ -2282,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size; + unsigned int i, nsrc, ndst, strtab_size = 0; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2293,9 +2293,6 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* strtab always starts with a nul, so offset 0 is the empty string. */ - strtab_size = 1; - /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || @@ -2337,7 +2334,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *s++ = 0; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { @@ -2378,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return size == 0 ? NULL : vmalloc_exec(size); + return vmalloc_exec(size); } static void *module_alloc_update_bounds(unsigned long size) @@ -2425,18 +2421,17 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, - const void *mod, unsigned long *_len) +static int module_sig_check(struct load_info *info) { int err = -ENOKEY; - unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - unsigned long len = *_len; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const void *mod = info->hdr; - if (len > markerlen && - memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + if (info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ - *_len -= markerlen; - err = mod_verify_sig(mod, _len); + info->len -= markerlen; + err = mod_verify_sig(mod, &info->len); } if (!err) { @@ -2454,59 +2449,107 @@ static int module_sig_check(struct load_info *info, return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, - void *mod, unsigned long *len) +static int module_sig_check(struct load_info *info) { return 0; } #endif /* !CONFIG_MODULE_SIG */ -/* Sets info->hdr, info->len and info->sig_ok. */ -static int copy_and_check(struct load_info *info, - const void __user *umod, unsigned long len, - const char __user *uargs) +/* Sanity checks against invalid binaries, wrong arch, weird elf version. 
*/ +static int elf_header_check(struct load_info *info) +{ + if (info->len < sizeof(*(info->hdr))) + return -ENOEXEC; + + if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 + || info->hdr->e_type != ET_REL + || !elf_check_arch(info->hdr) + || info->hdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (info->hdr->e_shoff >= info->len + || (info->hdr->e_shnum * sizeof(Elf_Shdr) > + info->len - info->hdr->e_shoff)) + return -ENOEXEC; + + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) { int err; - Elf_Ehdr *hdr; - if (len < sizeof(*hdr)) + info->len = len; + if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; + err = security_kernel_module_from_file(NULL); + if (err) + return err; + /* Suck in entire file: we'll want most of it. */ - if ((hdr = vmalloc(len)) == NULL) + info->hdr = vmalloc(info->len); + if (!info->hdr) return -ENOMEM; - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; + if (copy_from_user(info->hdr, umod, info->len) != 0) { + vfree(info->hdr); + return -EFAULT; } - err = module_sig_check(info, hdr, &len); + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_fd(int fd, struct load_info *info) +{ + struct file *file; + int err; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + file = fget(fd); + if (!file) + return -ENOEXEC; + + err = security_kernel_module_from_file(file); if (err) - goto free_hdr; + goto out; - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; - } + err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); + if (err) + goto out; - if (hdr->e_shoff >= len || - hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { - err = -ENOEXEC; - goto free_hdr; + if (stat.size > INT_MAX) { + err = -EFBIG; + goto out; + } + info->hdr = vmalloc(stat.size); + if (!info->hdr) { + err = -ENOMEM; + goto out; } - info->hdr = hdr; - info->len = len; - return 0; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(info->hdr); + err = bytes; + goto out; + } + if (bytes == 0) + break; + pos += bytes; + } + info->len = pos; -free_hdr: - vfree(hdr); +out: + fput(file); return err; } @@ -2515,7 +2558,7 @@ static void free_copy(struct load_info *info) vfree(info->hdr); } -static int rewrite_section_headers(struct load_info *info) +static int rewrite_section_headers(struct load_info *info, int flags) { unsigned int i; @@ -2543,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info) } /* Track but don't keep modinfo and version sections. */ - info->index.vers = find_sec(info, "__versions"); + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.info = find_sec(info, ".modinfo"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -2558,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info) * Return the temporary module pointer (we'll replace it with the final * one when we move the module sections around). 
*/ -static struct module *setup_load_info(struct load_info *info) +static struct module *setup_load_info(struct load_info *info, int flags) { unsigned int i; int err; @@ -2569,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info) info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info); + err = rewrite_section_headers(info, flags); if (err) return ERR_PTR(err); @@ -2607,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info) return mod; } -static int check_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo(struct module *mod, struct load_info *info, int flags) { const char *modmagic = get_modinfo(info, "vermagic"); int err; + if (flags & MODULE_INIT_IGNORE_VERMAGIC) + modmagic = NULL; + /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); @@ -2741,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info) memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc_update_bounds(mod->init_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ - kmemleak_ignore(ptr); - if (!ptr && mod->init_size) { - module_free(mod, mod->module_core); - return -ENOMEM; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + if (mod->init_size) { + ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr) { + module_free(mod, mod->module_core); + return -ENOMEM; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + } else + mod->module_init = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -2847,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } -static struct module *layout_and_allocate(struct load_info *info) +static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; Elf_Shdr *pcpusec; int err; - mod = setup_load_info(info); + mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; - err = check_modinfo(mod, info); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -2945,33 +2997,124 @@ static bool finished_loading(const char *name) return ret; } +/* Call module constructors. 
*/ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* This is where the real work happens */ +static int do_init_module(struct module *mod) +{ + int ret = 0; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; + } + if (ret > 0) { + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } + + /* Now it's a first class citizen! */ + mod->state = MODULE_STATE_LIVE; + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + + mutex_lock(&module_mutex); + /* Drop initial reference. */ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_ro_size = 0; + mod->init_text_size = 0; + mutex_unlock(&module_mutex); + wake_up_all(&module_wq); + + return 0; +} + +static int may_init_module(void) +{ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +static int load_module(struct load_info *info, const char __user *uargs, + int flags) { - struct load_info info = { NULL, }; struct module *mod, *old; long err; - pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); + err = module_sig_check(info); + if (err) + goto free_copy; - /* Copy in the blobs from userspace, check they are vaguely sane. */ - err = copy_and_check(&info, umod, len, uargs); + err = elf_header_check(info); if (err) - return ERR_PTR(err); + goto free_copy; /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(&info); + mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { err = PTR_ERR(mod); goto free_copy; } #ifdef CONFIG_MODULE_SIG - mod->sig_ok = info.sig_ok; + mod->sig_ok = info->sig_ok; if (!mod->sig_ok) add_taint_module(mod, TAINT_FORCED_MODULE); #endif @@ -2983,25 +3126,25 @@ static struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. 
*/ - find_module_sections(mod, &info); + find_module_sections(mod, info); err = check_module_license_and_versions(mod); if (err) goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, &info); + setup_modinfo(mod, info); /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, &info); + err = simplify_symbols(mod, info); if (err < 0) goto free_modinfo; - err = apply_relocations(mod, &info); + err = apply_relocations(mod, info); if (err < 0) goto free_modinfo; - err = post_relocation(mod, &info); + err = post_relocation(mod, info); if (err < 0) goto free_modinfo; @@ -3041,14 +3184,14 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - dynamic_debug_setup(info.debug, info.num_debug); + dynamic_debug_setup(info->debug, info->num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) goto ddebug; - module_bug_finalize(info.hdr, info.sechdrs, mod); + module_bug_finalize(info->hdr, info->sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -3059,16 +3202,17 @@ static struct module *load_module(void __user *umod, goto unlink; /* Link in to syfs. */ - err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); + err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto unlink; /* Get rid of temporary copy. */ - free_copy(&info); + free_copy(info); /* Done! */ trace_module_load(mod); - return mod; + + return do_init_module(mod); unlink: mutex_lock(&module_mutex); @@ -3077,7 +3221,7 @@ static struct module *load_module(void __user *umod, module_bug_cleanup(mod); wake_up_all(&module_wq); ddebug: - dynamic_debug_remove(info.debug); + dynamic_debug_remove(info->debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); @@ -3089,106 +3233,52 @@ static struct module *load_module(void __user *umod, free_unload: module_unload_free(mod); free_module: - module_deallocate(mod, &info); + module_deallocate(mod, info); free_copy: - free_copy(&info); - return ERR_PTR(err); -} - -/* Call module constructors. 
*/ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif + free_copy(info); + return err; } -/* This is where the real work happens */ SYSCALL_DEFINE3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs) { - struct module *mod; - int ret = 0; + int err; + struct load_info info = { }; - /* Must have permission */ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; + err = may_init_module(); + if (err) + return err; - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) - return PTR_ERR(mod); + pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); + err = copy_module_from_user(umod, len, &info); + if (err) + return err; - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); + return load_module(&info, uargs, 0); +} - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + int err; + struct load_info info = { }; - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } + err = may_init_module(); + if (err) + return err; - /* Now it's a first class citizen! */ - mod->state = MODULE_STATE_LIVE; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC)) + return -EINVAL; - mutex_lock(&module_mutex); - /* Drop initial reference. */ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; -#endif - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); - wake_up_all(&module_wq); + err = copy_module_from_fd(fd, &info); + if (err) + return err; - return 0; + return load_module(&info, uargs, flags); } static inline int within(unsigned long addr, void *start, unsigned long size) diff --git a/trunk/kernel/nsproxy.c b/trunk/kernel/nsproxy.c index 7e1c3de1ce45..78e2ecb20165 100644 --- a/trunk/kernel/nsproxy.c +++ b/trunk/kernel/nsproxy.c @@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. 
*/ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) + struct task_struct *tsk, struct user_namespace *user_ns, + struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk); + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk); + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -122,6 +123,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { err = -EPERM; goto out; } @@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, tsk->fs); + new_ns = create_new_namespaces(flags, tsk, + task_cred_xxx(tsk, user_ns), tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { + struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + new_fs ? 
new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; @@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); @@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/trunk/kernel/pid.c b/trunk/kernel/pid.c index fd996c1ed9f8..36aa02ff17d6 100644 --- a/trunk/kernel/pid.c +++ b/trunk/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). @@ -269,8 +257,24 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; + hlist_del_rcu(&upid->pid_chain); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). 
+ */ + wake_up_process(ns->child_reaper); + break; + case 0: + ns->nr_hashed = -1; + schedule_work(&ns->proc_work); + break; + } + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -292,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -302,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + if (ns->nr_hashed < 0) + goto out_unlock; + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: return pid; +out_unlock: + spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -344,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -428,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -483,7 +498,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -494,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; @@ -569,6 +584,7 @@ void __init pidmap_init(void) /* Reserve PID 0. 
We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/trunk/kernel/pid_namespace.c b/trunk/kernel/pid_namespace.c index 7b07cc0dfb75..fdbd0cdf271a 100644 --- a/trunk/kernel/pid_namespace.c +++ b/trunk/kernel/pid_namespace.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -71,10 +72,17 @@ static struct kmem_cache *create_pid_cachep(int nr_ids) return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, + struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; @@ -99,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); + ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) + if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); + return create_pid_namespace(user_ns, old_ns); } static void free_pid_ns(struct kref *kref) @@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). 
*/ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(¤t->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !capable(CAP_SYS_ADMIN)) + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = ¤t->nsproxy->pid_ns->last_pid; + tmp.data = &pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } @@ -299,6 +304,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } +static void *pidns_get(struct task_struct *task) +{ + struct pid_namespace *ns; + + rcu_read_lock(); + ns = get_pid_ns(task_active_pid_ns(task)); + rcu_read_unlock(); + + return ns; +} + +static void pidns_put(void *ns) +{ + put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *ancestor, *new = ns; + + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Only allow entering the current active pid namespace + * or a child of the current active pid namespace. + * + * This is required for fork to return a usable pid value and + * this maintains the property that processes and their + * children can not escape their current pid namespace. 
+ */ + if (new->level < active->level) + return -EINVAL; + + ancestor = new; + while (ancestor->level > active->level) + ancestor = ancestor->parent; + if (ancestor != active) + return -EINVAL; + + put_pid_ns(nsproxy->pid_ns); + nsproxy->pid_ns = get_pid_ns(new); + return 0; +} + +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + +const struct proc_ns_operations pidns_operations = { + .name = "pid", + .type = CLONE_NEWPID, + .get = pidns_get, + .put = pidns_put, + .install = pidns_install, + .inum = pidns_inum, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/trunk/kernel/posix-cpu-timers.c b/trunk/kernel/posix-cpu-timers.c index d73840271dce..a278cad1d5d6 100644 --- a/trunk/kernel/posix-cpu-timers.c +++ b/trunk/kernel/posix-cpu-timers.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -470,6 +471,8 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers, tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); diff --git a/trunk/kernel/printk.c b/trunk/kernel/printk.c index 22e070f3470a..19c0d7bcf24a 100644 --- a/trunk/kernel/printk.c +++ b/trunk/kernel/printk.c @@ -747,6 +747,21 @@ void __init setup_log_buf(int early) free, (free * 100) / __LOG_BUF_LEN); } +static bool __read_mostly ignore_loglevel; + +static int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 0; +} + +early_param("ignore_loglevel", ignore_loglevel_setup); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -770,13 +785,15 @@ static int __init boot_delay_setup(char *str) } __setup("boot_delay=", boot_delay_setup); -static void boot_delay_msec(void) +static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + || (level >= console_loglevel && !ignore_loglevel)) { return; + } k = (unsigned long long)loops_per_msec * boot_delay; @@ -795,7 +812,7 @@ static void boot_delay_msec(void) } } #else -static inline void boot_delay_msec(void) +static inline void boot_delay_msec(int level) { } #endif @@ -1238,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. 
@@ -1498,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; - boot_delay_msec(); + boot_delay_msec(level); printk_delay(); /* This stops the holder of console_sem just where we want him */ diff --git a/trunk/kernel/ptrace.c b/trunk/kernel/ptrace.c index 1f5e55dda955..1599157336a6 100644 --- a/trunk/kernel/ptrace.c +++ b/trunk/kernel/ptrace.c @@ -215,8 +215,12 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + rcu_read_lock(); + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; + rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); @@ -457,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer) return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (unlikely(p->ptrace & PT_EXITKILL)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, p); + if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } diff --git a/trunk/kernel/res_counter.c b/trunk/kernel/res_counter.c index 3920d593e63c..ff55247e7049 100644 --- a/trunk/kernel/res_counter.c +++ b/trunk/kernel/res_counter.c @@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) val = counter->usage; counter->usage -= val; + return counter->usage; } -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; + u64 ret = 0; local_irq_save(flags); for (c = counter; c != top; c = c->parent) { + u64 r; spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); + r = res_counter_uncharge_locked(c, val); + if (c == counter) + ret = r; spin_unlock(&c->lock); } local_irq_restore(flags); + return ret; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) { - res_counter_uncharge_until(counter, NULL, val); + return res_counter_uncharge_until(counter, NULL, val); } static inline unsigned long long * diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index c1fb82104bfb..257002c13bb0 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if 
(retval) diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched/fair.c index 4603d6cb9e25..5eea8707234a 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched/fair.c @@ -793,8 +793,11 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; static void task_numa_placement(struct task_struct *p) { - int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + int seq; + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; diff --git a/trunk/kernel/signal.c b/trunk/kernel/signal.c index a49c7f36ceb3..580a91e63471 100644 --- a/trunk/kernel/signal.c +++ b/trunk/kernel/signal.c @@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/trunk/kernel/sys_ni.c b/trunk/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/trunk/kernel/sys_ni.c +++ b/trunk/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); +cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); cond_syscall(sys_bind); diff --git a/trunk/kernel/sysctl_binary.c b/trunk/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/trunk/kernel/sysctl_binary.c +++ b/trunk/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) diff --git a/trunk/kernel/trace/ftrace.c b/trunk/kernel/trace/ftrace.c index afd092de45b7..3ffe4c5ad3f3 100644 --- a/trunk/kernel/trace/ftrace.c +++ b/trunk/kernel/trace/ftrace.c @@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 1; diff --git a/trunk/kernel/trace/trace.c b/trunk/kernel/trace/trace.c index 61e081b4ba11..e5125677efa0 100644 --- a/trunk/kernel/trace/trace.c +++ b/trunk/kernel/trace/trace.c @@ -3034,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) tr->data[cpu]->entries = val; } +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_array *tr, + struct trace_array *size_tr, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu]->entries, cpu); + if (ret < 0) + break; + tr->data[cpu]->entries = size_tr->data[cpu]->entries; + } + } else { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu_id]->entries, cpu_id); + if (ret == 0) + tr->data[cpu_id]->entries = + size_tr->data[cpu_id]->entries; + } + + return ret; +} + static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ 
-3058,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r = 0; - - if (cpu == RING_BUFFER_ALL_CPUS) { - int i; - for_each_tracing_cpu(i) { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[i]->entries, - i); - if (r < 0) - break; - } - } else { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[cpu]->entries, - cpu); - } - + int r = resize_buffer_duplicate_size(&global_trace, + &global_trace, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3212,17 +3222,11 @@ static int tracing_set_tracer(const char *buf) topts = create_trace_option_files(t); if (t->use_max_tr) { - int cpu; /* we need to make per cpu buffer sizes equivalent */ - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(max_tr.buffer, - global_trace.data[cpu]->entries, - cpu); - if (ret < 0) - goto out; - max_tr.data[cpu]->entries = - global_trace.data[cpu]->entries; - } + ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; } if (t->init) { @@ -4271,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -ENOMEM; if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); ret = -EINVAL; goto out; } if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); if (len < PAGE_SIZE) { ret = -EINVAL; goto out; diff --git a/trunk/kernel/trace/trace_stack.c b/trunk/kernel/trace/trace_stack.c index 0c1b165778e5..42ca822fc701 100644 --- a/trunk/kernel/trace/trace_stack.c +++ b/trunk/kernel/trace/trace_stack.c @@ -33,7 +33,6 @@ static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); static DEFINE_MUTEX(stack_sysctl_mutex); @@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, { int cpu; - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - preempt_disable_notrace(); cpu = raw_smp_processor_id(); diff --git a/trunk/kernel/trace/trace_uprobe.c b/trunk/kernel/trace/trace_uprobe.c index 9614db8b0f8c..c86e6d4f67fb 100644 --- a/trunk/kernel/trace/trace_uprobe.c +++ b/trunk/kernel/trace/trace_uprobe.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) /* setup a probe */ if (!event) { - char *tail = strrchr(filename, '/'); + char *tail; char *ptr; - ptr = kstrdup((tail ? 
tail + 1 : filename), GFP_KERNEL); - if (!ptr) { + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { ret = -ENOMEM; goto fail_address_parse; } - tail = ptr; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 750acffbe9ec..33acb5e53a5f 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/trunk/kernel/user_namespace.c b/trunk/kernel/user_namespace.c index 456a6b9fba34..2b042c42fbc4 100644 --- a/trunk/kernel/user_namespace.c +++ b/trunk/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) +{ + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + cred->securebits = SECUREBITS_DEFAULT; + cred->cap_inheritable = CAP_EMPTY_SET; + cred->cap_permitted = CAP_FULL_SET; + cred->cap_effective = CAP_FULL_SET; + cred->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(cred->request_key_auth); + cred->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + cred->user_ns = user_ns; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -39,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -52,38 +72,45 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); + /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - new->securebits = SECUREBITS_DEFAULT; - new->cap_inheritable = CAP_EMPTY_SET; - new->cap_permitted = CAP_FULL_SET; - new->cap_effective = CAP_FULL_SET; - new->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* Leave the new->user_ns reference with the new user namespace. */ - /* Leave the reference to our user_ns with the new cred. 
*/ - new->user_ns = ns; + set_cred_user_ns(new, ns); return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } @@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow mapping to your own filesystem ids */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); + if (uid_eq(uid, current_fsuid())) + return true; + } + else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (gid_eq(gid, current_fsgid())) + return true; + } + } + /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; @@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } +static void *userns_get(struct task_struct *task) +{ + struct user_namespace *user_ns; + + rcu_read_lock(); + user_ns = get_user_ns(__task_cred(task)->user_ns); + rcu_read_unlock(); + + return user_ns; +} + +static void userns_put(void *ns) +{ + put_user_ns(ns); +} + +static int userns_install(struct nsproxy *nsproxy, void *ns) +{ + struct user_namespace *user_ns = ns; + struct cred *cred; + + /* Don't allow gaining capabilities by reentering + * the same user namespace. 
+ */ + if (user_ns == current_user_ns()) + return -EINVAL; + + /* Threaded processes may not enter a different user namespace */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + put_user_ns(cred->user_ns); + set_cred_user_ns(cred, get_user_ns(user_ns)); + + return commit_creds(cred); +} + +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + +const struct proc_ns_operations userns_operations = { + .name = "user", + .type = CLONE_NEWUSER, + .get = userns_get, + .put = userns_put, + .install = userns_install, + .inum = userns_inum, +}; + static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); diff --git a/trunk/kernel/utsname.c b/trunk/kernel/utsname.c index 679d97a5d3fd..08b197e8c485 100644 --- a/trunk/kernel/utsname.c +++ b/trunk/kernel/utsname.c @@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; } @@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, * versa. 
*/ struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(tsk, old_ns); + new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; @@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -102,19 +109,32 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *ns) +static int utsns_install(struct nsproxy *nsproxy, void *new) { + struct uts_namespace *ns = new; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/trunk/kernel/watchdog.c b/trunk/kernel/watchdog.c index c8c21be11ab4..75a2ab3d0b02 100644 --- a/trunk/kernel/watchdog.c +++ b/trunk/kernel/watchdog.c @@ -31,6 +31,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_disabled; +static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static u64 get_sample_period(void) +static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -125,7 +126,7 @@ static u64 get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) wake_up_process(__this_cpu_read(softlockup_watchdog)); /* .. 
and repeat */ - hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); if (touch_ts == 0) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { @@ -343,6 +344,10 @@ static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + if (!watchdog_enabled) { kthread_park(current); return; @@ -351,12 +356,8 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* kick off the timer for the hardlockup detector */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); /* initialize timestamp */ @@ -368,9 +369,6 @@ static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - if (!watchdog_enabled) - return; - watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ @@ -386,7 +384,7 @@ static int watchdog_should_run(unsigned int cpu) /* * The watchdog thread function - touches the timestamp. * - * It only runs once every get_sample_period() seconds (4 seconds by + * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). @@ -519,6 +517,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, if (ret || !write) return ret; + set_sample_period(); if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else @@ -540,6 +539,7 @@ static struct smp_hotplug_thread watchdog_threads = { void __init lockup_detector_init(void) { + set_sample_period(); if (smpboot_register_percpu_thread(&watchdog_threads)) { pr_err("Failed to create watchdog threads, disabled\n"); watchdog_disabled = -ENODEV; diff --git a/trunk/lib/Kconfig b/trunk/lib/Kconfig index 4b31a46fb307..75cdb77fa49d 100644 --- a/trunk/lib/Kconfig +++ b/trunk/lib/Kconfig @@ -42,6 +42,9 @@ config GENERIC_IO config STMP_DEVICE bool +config PERCPU_RWSEM + boolean + config CRC_CCITT tristate "CRC-CCITT functions" help diff --git a/trunk/lib/Kconfig.debug b/trunk/lib/Kconfig.debug index e458782f3c52..3a353091a903 100644 --- a/trunk/lib/Kconfig.debug +++ b/trunk/lib/Kconfig.debug @@ -1192,14 +1192,14 @@ config MEMORY_NOTIFIER_ERROR_INJECT If unsure, say N. -config PSERIES_RECONFIG_NOTIFIER_ERROR_INJECT - tristate "pSeries reconfig notifier error injection module" - depends on PPC_PSERIES && NOTIFIER_ERROR_INJECTION +config OF_RECONFIG_NOTIFIER_ERROR_INJECT + tristate "OF reconfig notifier error injection module" + depends on OF_DYNAMIC && NOTIFIER_ERROR_INJECTION help This option provides the ability to inject artificial errors to - pSeries reconfig notifier chain callbacks. It is controlled + OF reconfig notifier chain callbacks. It is controlled through debugfs interface under - /sys/kernel/debug/notifier-error-inject/pSeries-reconfig/ + /sys/kernel/debug/notifier-error-inject/OF-reconfig/ If the notifier call chain should be failed with some events notified, write the error code to "actions//error". 
diff --git a/trunk/lib/Makefile b/trunk/lib/Makefile index e2152fa7ff4d..02ed6c04cd7d 100644 --- a/trunk/lib/Makefile +++ b/trunk/lib/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o @@ -94,8 +95,8 @@ obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o -obj-$(CONFIG_PSERIES_RECONFIG_NOTIFIER_ERROR_INJECT) += \ - pSeries-reconfig-notifier-error-inject.o +obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ + of-reconfig-notifier-error-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o diff --git a/trunk/lib/asn1_decoder.c b/trunk/lib/asn1_decoder.c index 5293d2433029..11b9b01fda6b 100644 --- a/trunk/lib/asn1_decoder.c +++ b/trunk/lib/asn1_decoder.c @@ -81,7 +81,7 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen goto next_tag; } - if (unlikely((tag & 0x1f) == 0x1f)) { + if (unlikely((tag & 0x1f) == ASN1_LONG_TAG)) { do { if (unlikely(datalen - dp < 2)) goto data_overrun_error; @@ -96,7 +96,7 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen goto next_tag; } - if (unlikely(len == 0x80)) { + if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ if (unlikely((tag & ASN1_CONS_BIT) == ASN1_PRIM << 5)) goto indefinite_len_primitive; @@ -222,7 +222,7 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, if (unlikely(dp >= datalen - 1)) goto data_overrun_error; tag = data[dp++]; - if (unlikely((tag & 0x1f) == 0x1f)) + if (unlikely((tag & 0x1f) == ASN1_LONG_TAG)) goto long_tag_not_supported; if (op & ASN1_OP_MATCH__ANY) { @@ -254,7 +254,7 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, len = data[dp++]; if (len > 0x7f) { - if (unlikely(len == 0x80)) { + if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ if (unlikely(!(tag & ASN1_CONS_BIT))) goto indefinite_len_primitive; diff --git a/trunk/lib/dynamic_debug.c b/trunk/lib/dynamic_debug.c index e7f7d993357a..1db1fc660538 100644 --- a/trunk/lib/dynamic_debug.c +++ b/trunk/lib/dynamic_debug.c @@ -62,13 +62,6 @@ static LIST_HEAD(ddebug_tables); static int verbose = 0; module_param(verbose, int, 0644); -/* Return the last part of a pathname */ -static inline const char *basename(const char *path) -{ - const char *tail = strrchr(path, '/'); - return tail ? 
tail+1 : path; -} - /* Return the path relative to source root */ static inline const char *trim_prefix(const char *path) { @@ -154,7 +147,7 @@ static int ddebug_change(const struct ddebug_query *query, /* match against the source filename */ if (query->filename && strcmp(query->filename, dp->filename) && - strcmp(query->filename, basename(dp->filename)) && + strcmp(query->filename, kbasename(dp->filename)) && strcmp(query->filename, trim_prefix(dp->filename))) continue; diff --git a/trunk/lib/interval_tree_test_main.c b/trunk/lib/interval_tree_test_main.c index b25903987f7a..245900b98c8e 100644 --- a/trunk/lib/interval_tree_test_main.c +++ b/trunk/lib/interval_tree_test_main.c @@ -30,7 +30,8 @@ static void init(void) { int i; for (i = 0; i < NODES; i++) { - u32 a = prandom32(&rnd), b = prandom32(&rnd); + u32 a = prandom_u32_state(&rnd); + u32 b = prandom_u32_state(&rnd); if (a <= b) { nodes[i].start = a; nodes[i].last = b; @@ -40,7 +41,7 @@ static void init(void) } } for (i = 0; i < SEARCHES; i++) - queries[i] = prandom32(&rnd); + queries[i] = prandom_u32_state(&rnd); } static int interval_tree_test_init(void) @@ -51,7 +52,7 @@ static int interval_tree_test_init(void) printk(KERN_ALERT "interval tree insert/remove"); - prandom32_seed(&rnd, 3141592653589793238ULL); + prandom_seed_state(&rnd, 3141592653589793238ULL); init(); time1 = get_cycles(); diff --git a/trunk/lib/kstrtox.c b/trunk/lib/kstrtox.c index c3615eab0cc3..f78ae0c0c4e2 100644 --- a/trunk/lib/kstrtox.c +++ b/trunk/lib/kstrtox.c @@ -104,6 +104,22 @@ static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) return 0; } +/** + * kstrtoull - convert a string to an unsigned long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtoull(const char *s, unsigned int base, unsigned long long *res) { if (s[0] == '+') @@ -112,6 +128,22 @@ int kstrtoull(const char *s, unsigned int base, unsigned long long *res) } EXPORT_SYMBOL(kstrtoull); +/** + * kstrtoll - convert a string to a long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. 
+ * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtoll(const char *s, unsigned int base, long long *res) { unsigned long long tmp; @@ -168,6 +200,22 @@ int _kstrtol(const char *s, unsigned int base, long *res) } EXPORT_SYMBOL(_kstrtol); +/** + * kstrtouint - convert a string to an unsigned int + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtouint(const char *s, unsigned int base, unsigned int *res) { unsigned long long tmp; @@ -183,6 +231,22 @@ int kstrtouint(const char *s, unsigned int base, unsigned int *res) } EXPORT_SYMBOL(kstrtouint); +/** + * kstrtoint - convert a string to an int + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtoint(const char *s, unsigned int base, int *res) { long long tmp; diff --git a/trunk/lib/lru_cache.c b/trunk/lib/lru_cache.c index a07e7268d7ed..d71d89498943 100644 --- a/trunk/lib/lru_cache.c +++ b/trunk/lib/lru_cache.c @@ -44,8 +44,8 @@ MODULE_LICENSE("GPL"); } while (0) #define RETURN(x...) do { \ - clear_bit(__LC_PARANOIA, &lc->flags); \ - smp_mb__after_clear_bit(); return x ; } while (0) + clear_bit_unlock(__LC_PARANOIA, &lc->flags); \ + return x ; } while (0) /* BUG() if e is not one of the elements tracked by lc */ #define PARANOIA_LC_ELEMENT(lc, e) do { \ @@ -55,9 +55,40 @@ MODULE_LICENSE("GPL"); BUG_ON(i >= lc_->nr_elements); \ BUG_ON(lc_->lc_element[i] != e_); } while (0) + +/* We need to atomically + * - try to grab the lock (set LC_LOCKED) + * - only if there is no pending transaction + * (neither LC_DIRTY nor LC_STARVING is set) + * Because of PARANOIA_ENTRY() above abusing lc->flags as well, + * it is not sufficient to just say + * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED); + */ +int lc_try_lock(struct lru_cache *lc) +{ + unsigned long val; + do { + val = cmpxchg(&lc->flags, 0, LC_LOCKED); + } while (unlikely (val == LC_PARANOIA)); + /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. 
*/ + return 0 == val; +#if 0 + /* Alternative approach, spin in case someone enters or leaves a + * PARANOIA_ENTRY()/RETURN() section. */ + unsigned long old, new, val; + do { + old = lc->flags & LC_PARANOIA; + new = old | LC_LOCKED; + val = cmpxchg(&lc->flags, old, new); + } while (unlikely (val == (old ^ LC_PARANOIA))); + return old == val; +#endif +} + /** * lc_create - prepares to track objects in an active set * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details + * @max_pending_changes: maximum changes to accumulate until a transaction is required * @e_count: number of elements allowed to be active simultaneously * @e_size: size of the tracked objects * @e_off: offset to the &struct lc_element member in a tracked object @@ -66,6 +97,7 @@ MODULE_LICENSE("GPL"); * or NULL on (allocation) failure. */ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off) { struct hlist_head *slot = NULL; @@ -98,12 +130,13 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); + INIT_LIST_HEAD(&lc->to_be_changed); lc->name = name; lc->element_size = e_size; lc->element_off = e_off; lc->nr_elements = e_count; - lc->new_number = LC_FREE; + lc->max_pending_changes = max_pending_changes; lc->lc_cache = cache; lc->lc_element = element; lc->lc_slot = slot; @@ -117,6 +150,7 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, e = p + e_off; e->lc_index = i; e->lc_number = LC_FREE; + e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); element[i] = e; } @@ -175,15 +209,15 @@ void lc_reset(struct lru_cache *lc) INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); + INIT_LIST_HEAD(&lc->to_be_changed); lc->used = 0; lc->hits = 0; lc->misses = 0; lc->starving = 0; - lc->dirty = 0; + lc->locked = 0; lc->changed = 0; + lc->pending_changes = 0; lc->flags = 0; - lc->changing_element = NULL; - lc->new_number = LC_FREE; memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); for (i = 0; i < lc->nr_elements; i++) { @@ -194,6 +228,7 @@ void lc_reset(struct lru_cache *lc) /* re-init it */ e->lc_index = i; e->lc_number = LC_FREE; + e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); } } @@ -208,14 +243,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) /* NOTE: * total calls to lc_get are * (starving + hits + misses) - * misses include "dirty" count (update from an other thread in + * misses include "locked" count (update from an other thread in * progress) and "changed", when this in fact lead to an successful * update of the cache. 
*/ return seq_printf(seq, "\t%s: used:%u/%u " - "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", + "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, - lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); + lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) @@ -224,16 +259,8 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) } -/** - * lc_find - find element by label, if present in the hash table - * @lc: The lru_cache object - * @enr: element number - * - * Returns the pointer to an element, if the element with the requested - * "label" or element number is present in the hash table, - * or NULL if not found. Does not change the refcnt. - */ -struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) +static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, + bool include_changing) { struct hlist_node *n; struct lc_element *e; @@ -241,29 +268,48 @@ struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) BUG_ON(!lc); BUG_ON(!lc->nr_elements); hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { - if (e->lc_number == enr) + /* "about to be changed" elements, pending transaction commit, + * are hashed by their "new number". "Normal" elements have + * lc_number == lc_new_number. */ + if (e->lc_new_number != enr) + continue; + if (e->lc_new_number == e->lc_number || include_changing) return e; + break; } return NULL; } -/* returned element will be "recycled" immediately */ -static struct lc_element *lc_evict(struct lru_cache *lc) +/** + * lc_find - find element by label, if present in the hash table + * @lc: The lru_cache object + * @enr: element number + * + * Returns the pointer to an element, if the element with the requested + * "label" or element number is present in the hash table, + * or NULL if not found. Does not change the refcnt. + * Ignores elements that are "about to be used", i.e. not yet in the active + * set, but still pending transaction commit. + */ +struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) { - struct list_head *n; - struct lc_element *e; - - if (list_empty(&lc->lru)) - return NULL; - - n = lc->lru.prev; - e = list_entry(n, struct lc_element, list); - - PARANOIA_LC_ELEMENT(lc, e); + return __lc_find(lc, enr, 0); +} - list_del(&e->list); - hlist_del(&e->colision); - return e; +/** + * lc_is_used - find element by label + * @lc: The lru_cache object + * @enr: element number + * + * Returns true, if the element with the requested "label" or element number is + * present in the hash table, and is used (refcnt > 0). + * Also finds elements that are not _currently_ used but only "about to be + * used", i.e. on the "to_be_changed" list, pending transaction commit. 
+ */ +bool lc_is_used(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e = __lc_find(lc, enr, 1); + return e && e->refcnt; } /** @@ -280,22 +326,34 @@ void lc_del(struct lru_cache *lc, struct lc_element *e) PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt); - e->lc_number = LC_FREE; + e->lc_number = e->lc_new_number = LC_FREE; hlist_del_init(&e->colision); list_move(&e->list, &lc->free); RETURN(); } -static struct lc_element *lc_get_unused_element(struct lru_cache *lc) +static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number) { struct list_head *n; + struct lc_element *e; + + if (!list_empty(&lc->free)) + n = lc->free.next; + else if (!list_empty(&lc->lru)) + n = lc->lru.prev; + else + return NULL; + + e = list_entry(n, struct lc_element, list); + PARANOIA_LC_ELEMENT(lc, e); - if (list_empty(&lc->free)) - return lc_evict(lc); + e->lc_new_number = new_number; + if (!hlist_unhashed(&e->colision)) + __hlist_del(&e->colision); + hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); + list_move(&e->list, &lc->to_be_changed); - n = lc->free.next; - list_del(n); - return list_entry(n, struct lc_element, list); + return e; } static int lc_unused_element_available(struct lru_cache *lc) @@ -308,45 +366,7 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } - -/** - * lc_get - get element by label, maybe change the active set - * @lc: the lru cache to operate on - * @enr: the label to look up - * - * Finds an element in the cache, increases its usage count, - * "touches" and returns it. - * - * In case the requested number is not present, it needs to be added to the - * cache. Therefore it is possible that an other element becomes evicted from - * the cache. In either case, the user is notified so he is able to e.g. keep - * a persistent log of the cache changes, and therefore the objects in use. - * - * Return values: - * NULL - * The cache was marked %LC_STARVING, - * or the requested label was not in the active set - * and a changing transaction is still pending (@lc was marked %LC_DIRTY). - * Or no unused or free element could be recycled (@lc will be marked as - * %LC_STARVING, blocking further lc_get() operations). - * - * pointer to the element with the REQUESTED element number. - * In this case, it can be used right away - * - * pointer to an UNUSED element with some different element number, - * where that different number may also be %LC_FREE. - * - * In this case, the cache is marked %LC_DIRTY (blocking further changes), - * and the returned element pointer is removed from the lru list and - * hash collision chains. The user now should do whatever housekeeping - * is necessary. - * Then he must call lc_changed(lc,element_pointer), to finish - * the change. - * - * NOTE: The user needs to check the lc_number on EACH use, so he recognizes - * any cache set change. - */ -struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) { struct lc_element *e; @@ -356,8 +376,12 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) RETURN(NULL); } - e = lc_find(lc, enr); - if (e) { + e = __lc_find(lc, enr, 1); + /* if lc_new_number != lc_number, + * this enr is currently being pulled in already, + * and will be available once the pending transaction + * has been committed. 
*/ + if (e && e->lc_new_number == e->lc_number) { ++lc->hits; if (e->refcnt++ == 0) lc->used++; @@ -366,6 +390,26 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) } ++lc->misses; + if (!may_change) + RETURN(NULL); + + /* It has been found above, but on the "to_be_changed" list, not yet + * committed. Don't pull it in twice, wait for the transaction, then + * try again */ + if (e) + RETURN(NULL); + + /* To avoid races with lc_try_lock(), first, mark us dirty + * (using test_and_set_bit, as it implies memory barriers), ... */ + test_and_set_bit(__LC_DIRTY, &lc->flags); + + /* ... only then check if it is locked anyways. If lc_unlock clears + * the dirty bit again, that's not a problem, we will come here again. + */ + if (test_bit(__LC_LOCKED, &lc->flags)) { + ++lc->locked; + RETURN(NULL); + } /* In case there is nothing available and we can not kick out * the LRU element, we have to wait ... @@ -375,71 +419,109 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) RETURN(NULL); } - /* it was not present in the active set. - * we are going to recycle an unused (or even "free") element. - * user may need to commit a transaction to record that change. - * we serialize on flags & TF_DIRTY */ - if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { - ++lc->dirty; + /* It was not present in the active set. We are going to recycle an + * unused (or even "free") element, but we won't accumulate more than + * max_pending_changes changes. */ + if (lc->pending_changes >= lc->max_pending_changes) RETURN(NULL); - } - e = lc_get_unused_element(lc); + e = lc_prepare_for_change(lc, enr); BUG_ON(!e); clear_bit(__LC_STARVING, &lc->flags); BUG_ON(++e->refcnt != 1); lc->used++; - - lc->changing_element = e; - lc->new_number = enr; + lc->pending_changes++; RETURN(e); } -/* similar to lc_get, - * but only gets a new reference on an existing element. - * you either get the requested element, or NULL. - * will be consolidated into one function. +/** + * lc_get - get element by label, maybe change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes evicted from + * the cache. In either case, the user is notified so he is able to e.g. keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * and a changing transaction is still pending (@lc was marked %LC_DIRTY). + * Or no unused or free element could be recycled (@lc will be marked as + * %LC_STARVING, blocking further lc_get() operations). + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number, + * where that different number may also be %LC_FREE. + * + * In this case, the cache is marked %LC_DIRTY, + * so lc_try_lock() will no longer succeed. + * The returned element pointer is moved to the "to_be_changed" list, + * and registered with the new element number on the hash collision chains, + * so it is possible to pick it up from lc_is_used(). + * Up to "max_pending_changes" (see lc_create()) can be accumulated. 
+ * The user now should do whatever housekeeping is necessary, + * typically serialize on lc_try_lock_for_transaction(), then call + * lc_committed(lc) and lc_unlock(), to finish the change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. */ -struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - struct lc_element *e; - - PARANOIA_ENTRY(); - if (lc->flags & LC_STARVING) { - ++lc->starving; - RETURN(NULL); - } + return __lc_get(lc, enr, 1); +} - e = lc_find(lc, enr); - if (e) { - ++lc->hits; - if (e->refcnt++ == 0) - lc->used++; - list_move(&e->list, &lc->in_use); /* Not evictable... */ - } - RETURN(e); +/** + * lc_try_get - get element by label, if present; do not change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + */ +struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, 0); } /** - * lc_changed - tell @lc that the change has been recorded + * lc_committed - tell @lc that pending changes have been recorded * @lc: the lru cache to operate on - * @e: the element pending label change + * + * User is expected to serialize on explicit lc_try_lock_for_transaction() + * before the transaction is started, and later needs to lc_unlock() explicitly + * as well. */ -void lc_changed(struct lru_cache *lc, struct lc_element *e) +void lc_committed(struct lru_cache *lc) { + struct lc_element *e, *tmp; + PARANOIA_ENTRY(); - BUG_ON(e != lc->changing_element); - PARANOIA_LC_ELEMENT(lc, e); - ++lc->changed; - e->lc_number = lc->new_number; - list_add(&e->list, &lc->in_use); - hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); - lc->changing_element = NULL; - lc->new_number = LC_FREE; - clear_bit(__LC_DIRTY, &lc->flags); - smp_mb__after_clear_bit(); + list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) { + /* count number of changes, not number of transactions */ + ++lc->changed; + e->lc_number = e->lc_new_number; + list_move(&e->list, &lc->in_use); + } + lc->pending_changes = 0; RETURN(); } @@ -458,13 +540,12 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) PARANOIA_ENTRY(); PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt == 0); - BUG_ON(e == lc->changing_element); + BUG_ON(e->lc_number != e->lc_new_number); if (--e->refcnt == 0) { /* move it to the front of LRU. */ list_move(&e->list, &lc->lru); lc->used--; - clear_bit(__LC_STARVING, &lc->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(__LC_STARVING, &lc->flags); } RETURN(e->refcnt); } @@ -504,16 +585,24 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) void lc_set(struct lru_cache *lc, unsigned int enr, int index) { struct lc_element *e; + struct list_head *lh; if (index < 0 || index >= lc->nr_elements) return; e = lc_element_by_index(lc, index); - e->lc_number = enr; + BUG_ON(e->lc_number != e->lc_new_number); + BUG_ON(e->refcnt != 0); + e->lc_number = e->lc_new_number = enr; hlist_del_init(&e->colision); - hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); - list_move(&e->list, e->refcnt ? 
&lc->in_use : &lc->lru); + if (enr == LC_FREE) + lh = &lc->free; + else { + hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); + lh = &lc->lru; + } + list_move(&e->list, lh); } /** @@ -553,8 +642,10 @@ EXPORT_SYMBOL(lc_try_get); EXPORT_SYMBOL(lc_find); EXPORT_SYMBOL(lc_get); EXPORT_SYMBOL(lc_put); -EXPORT_SYMBOL(lc_changed); +EXPORT_SYMBOL(lc_committed); EXPORT_SYMBOL(lc_element_by_index); EXPORT_SYMBOL(lc_index_of); EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); +EXPORT_SYMBOL(lc_try_lock); +EXPORT_SYMBOL(lc_is_used); diff --git a/trunk/lib/pSeries-reconfig-notifier-error-inject.c b/trunk/lib/of-reconfig-notifier-error-inject.c similarity index 51% rename from trunk/lib/pSeries-reconfig-notifier-error-inject.c rename to trunk/lib/of-reconfig-notifier-error-inject.c index 7f7c98dcd5c4..8dc79861758a 100644 --- a/trunk/lib/pSeries-reconfig-notifier-error-inject.c +++ b/trunk/lib/of-reconfig-notifier-error-inject.c @@ -1,20 +1,20 @@ #include #include - -#include +#include #include "notifier-error-inject.h" static int priority; module_param(priority, int, 0); -MODULE_PARM_DESC(priority, "specify pSeries reconfig notifier priority"); +MODULE_PARM_DESC(priority, "specify OF reconfig notifier priority"); static struct notifier_err_inject reconfig_err_inject = { .actions = { - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_RECONFIG_ADD) }, - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_RECONFIG_REMOVE) }, - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_DRCONF_MEM_ADD) }, - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_DRCONF_MEM_REMOVE) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_ATTACH_NODE) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_DETACH_NODE) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_ADD_PROPERTY) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_REMOVE_PROPERTY) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_UPDATE_PROPERTY) }, {} } }; @@ -25,12 +25,12 @@ static int err_inject_init(void) { int err; - dir = notifier_err_inject_init("pSeries-reconfig", + dir = notifier_err_inject_init("OF-reconfig", notifier_err_inject_dir, &reconfig_err_inject, priority); if (IS_ERR(dir)) return PTR_ERR(dir); - err = pSeries_reconfig_notifier_register(&reconfig_err_inject.nb); + err = of_reconfig_notifier_register(&reconfig_err_inject.nb); if (err) debugfs_remove_recursive(dir); @@ -39,13 +39,13 @@ static int err_inject_init(void) static void err_inject_exit(void) { - pSeries_reconfig_notifier_unregister(&reconfig_err_inject.nb); + of_reconfig_notifier_unregister(&reconfig_err_inject.nb); debugfs_remove_recursive(dir); } module_init(err_inject_init); module_exit(err_inject_exit); -MODULE_DESCRIPTION("pSeries reconfig notifier error injection module"); +MODULE_DESCRIPTION("OF reconfig notifier error injection module"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Akinobu Mita "); diff --git a/trunk/lib/percpu-rwsem.c b/trunk/lib/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/trunk/lib/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, + const char *name, struct lock_class_key *rwsem_key) +{ + brw->fast_read_ctr = alloc_percpu(int); + if (unlikely(!brw->fast_read_ctr)) + return -ENOMEM; + + /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + __init_rwsem(&brw->rw_sem, name, rwsem_key); + atomic_set(&brw->write_ctr, 0); + atomic_set(&brw->slow_read_ctr, 0); + init_waitqueue_head(&brw->write_waitq); + return 0; +} + +void 
percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ + free_percpu(brw->fast_read_ctr); + brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + * R_W: down_write() comes after up_read(), the writer should see all + * changes done by the reader + * or + * W_R: down_read() comes after up_write(), the reader should see all + * changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ + bool success = false; + + preempt_disable(); + if (likely(!atomic_read(&brw->write_ctr))) { + __this_cpu_add(*brw->fast_read_ctr, val); + success = true; + } + preempt_enable(); + + return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). + */ +void percpu_down_read(struct percpu_rw_semaphore *brw) +{ + might_sleep(); + if (likely(update_fast_ctr(brw, +1))) { + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + return; + } + + down_read(&brw->rw_sem); + atomic_inc(&brw->slow_read_ctr); + /* avoid up_read()->rwsem_release() */ + __up_read(&brw->rw_sem); +} + +void percpu_up_read(struct percpu_rw_semaphore *brw) +{ + rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); + + if (likely(update_fast_ctr(brw, -1))) + return; + + /* false-positive is possible but harmless */ + if (atomic_dec_and_test(&brw->slow_read_ctr)) + wake_up_all(&brw->write_waitq); +} + +static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +{ + unsigned int sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + sum += per_cpu(*brw->fast_read_ctr, cpu); + per_cpu(*brw->fast_read_ctr, cpu) = 0; + } + + return sum; +} + +/* + * A writer increments ->write_ctr to force the readers to switch to the + * slow mode, note the atomic_read() check in update_fast_ctr(). + * + * After that the readers can only inc/dec the slow ->slow_read_ctr counter, + * ->fast_read_ctr is stable. Once the writer moves its sum into the slow + * counter it represents the number of active readers. + * + * Finally the writer takes ->rw_sem for writing and blocks the new readers, + * then waits until the slow counter becomes zero. + */ +void percpu_down_write(struct percpu_rw_semaphore *brw) +{ + /* tell update_fast_ctr() there is a pending writer */ + atomic_inc(&brw->write_ctr); + /* + * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read + * so that update_fast_ctr() can't succeed. + * + * 2. Ensures we see the result of every previous this_cpu_add() in + * update_fast_ctr(). + * + * 3. 
Ensures that if any reader has exited its critical section via + * fast-path, it executes a full memory barrier before we return. + * See R_W case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + + /* exclude other writers, and block the new readers completely */ + down_write(&brw->rw_sem); + + /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ + atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + + /* wait for all readers to complete their percpu_up_read() */ + wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); +} + +void percpu_up_write(struct percpu_rw_semaphore *brw) +{ + /* release the lock, but the readers can't use the fast-path */ + up_write(&brw->rw_sem); + /* + * Insert the barrier before the next fast-path in down_read, + * see W_R case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + /* the last writer unblocks update_fast_ctr() */ + atomic_dec(&brw->write_ctr); +} diff --git a/trunk/lib/raid6/Makefile b/trunk/lib/raid6/Makefile index de06dfe165b8..9f7c184725d7 100644 --- a/trunk/lib/raid6/Makefile +++ b/trunk/lib/raid6/Makefile @@ -1,8 +1,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o -raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ - int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ - altivec8.o mmx.o sse1.o sse2.o +raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ + int8.o int16.o int32.o + +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o + hostprogs-y += mktables quiet_cmd_unroll = UNROLL $@ diff --git a/trunk/lib/raid6/algos.c b/trunk/lib/raid6/algos.c index 589f5f50ad2e..6d7316fe9f30 100644 --- a/trunk/lib/raid6/algos.c +++ b/trunk/lib/raid6/algos.c @@ -45,11 +45,20 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_sse1x2, &raid6_sse2x1, &raid6_sse2x2, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, &raid6_sse2x2, &raid6_sse2x4, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, + &raid6_avx2x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, @@ -72,6 +81,9 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) +#ifdef CONFIG_AS_AVX2 + &raid6_recov_avx2, +#endif &raid6_recov_ssse3, #endif &raid6_recov_intx1, diff --git a/trunk/lib/raid6/altivec.uc b/trunk/lib/raid6/altivec.uc index b71012b756f4..7cc12b532e95 100644 --- a/trunk/lib/raid6/altivec.uc +++ b/trunk/lib/raid6/altivec.uc @@ -24,13 +24,10 @@ #include -#ifdef CONFIG_ALTIVEC - #include #ifdef __KERNEL__ # include # include -#endif /* * This is the C data type to use. We use a vector of diff --git a/trunk/lib/raid6/avx2.c b/trunk/lib/raid6/avx2.c new file mode 100644 index 000000000000..bc3b1dd436eb --- /dev/null +++ b/trunk/lib/raid6/avx2.c @@ -0,0 +1,251 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 2012 Intel Corporation + * Author: Yuanhan Liu + * + * Based on sse2.c: Copyright 2002 H. 
Peter Anvin - All Rights Reserved + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * AVX2 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX2 + +#include +#include "x86.h" + +static const struct raid6_avx2_constants { + u64 x1d[4]; +} raid6_avx2_constants __aligned(32) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); +} + +/* + * Plain AVX2 implementation + */ +static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */ + + for (d = 0; d < bytes; d += 32) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x1 = { + raid6_avx21_gen_syndrome, + raid6_have_avx2, + "avx2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX2 implementation + */ +static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("vmovdqa 
%0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ + asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ + asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x2 = { + raid6_avx22_gen_syndrome, + raid6_have_avx2, + "avx2x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */ + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */ + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */ + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */ + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */ + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */ + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */ + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */ + + for (d = 0; d < bytes; d += 128) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); + 
asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x4 = { + raid6_avx24_gen_syndrome, + raid6_have_avx2, + "avx2x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX2 */ diff --git a/trunk/lib/raid6/mmx.c b/trunk/lib/raid6/mmx.c index 279347f23094..590c71c9e200 100644 --- a/trunk/lib/raid6/mmx.c +++ b/trunk/lib/raid6/mmx.c @@ -16,7 +16,7 @@ * MMX implementation of RAID-6 syndrome functions */ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32 #include #include "x86.h" diff --git a/trunk/lib/raid6/recov_avx2.c b/trunk/lib/raid6/recov_avx2.c new file mode 100644 index 000000000000..e1eea433a493 --- /dev/null +++ b/trunk/lib/raid6/recov_avx2.c @@ -0,0 +1,323 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#if CONFIG_AS_AVX2 + +#include +#include "x86.h" + +static int raid6_has_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX); +} + +static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* ymm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0])); + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32])); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32])); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[32] ^ q[32] + * 0 = dp[0] ^ p[0] + * 8 = dp[32] ^ p[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpsraw $4, %ymm9, %ymm12"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm9, %ymm9"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm9, %ymm4, %ymm14"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm12, %ymm5, %ymm15"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm14, %ymm15, %ymm15"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* + * 5 = qx[0] + * 15 = qx[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpsraw $4, %ymm8, %ymm6"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm14"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm14, %ymm4, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm13"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + asm volatile("vpxor %ymm12, %ymm13, %ymm13"); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[32]] + */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + asm volatile("vpxor %ymm15, %ymm13, %ymm13"); + + /* + * 1 = db = DQ + * 13 = db[32] = DQ[32] + */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm13,%0" : 
"=m" (dq[32])); + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vpxor %ymm13, %ymm8, %ymm8"); + + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q)); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p)); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq)); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* 5 = qx */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + + /* 1 = pbmul[px] */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + /* 1 = db = DQ */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[32] ^ dq[32] + */ + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vmovapd %ymm0, %ymm13"); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + asm volatile("vmovapd %ymm1, %ymm14"); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpsraw $4, %ymm8, %ymm12"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm8"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm8, %ymm13, %ymm13"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpshufb %ymm12, %ymm14, %ymm14"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + asm 
volatile("vpxor %ymm13, %ymm14, %ymm14"); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[32] ^ dq[32]] + */ + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + asm volatile("vpxor %ymm14, %ymm12, %ymm12"); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[32] ^ qmul[q[32] ^ dq[32]] + */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx2 = { + .data2 = raid6_2data_recov_avx2, + .datap = raid6_datap_recov_avx2, + .valid = raid6_has_avx2, +#ifdef CONFIG_X86_64 + .name = "avx2x2", +#else + .name = "avx2x1", +#endif + .priority = 2, +}; + +#else +#warning "your version of binutils lacks AVX2 support" +#endif diff --git a/trunk/lib/raid6/recov_ssse3.c b/trunk/lib/raid6/recov_ssse3.c index ecb710c0b4d9..a9168328f03b 100644 --- a/trunk/lib/raid6/recov_ssse3.c +++ b/trunk/lib/raid6/recov_ssse3.c @@ -7,8 +7,6 @@ * of the License. */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #include #include "x86.h" @@ -332,5 +330,3 @@ const struct raid6_recov_calls raid6_recov_ssse3 = { #endif .priority = 1, }; - -#endif diff --git a/trunk/lib/raid6/sse1.c b/trunk/lib/raid6/sse1.c index 10dd91948c07..f76297139445 100644 --- a/trunk/lib/raid6/sse1.c +++ b/trunk/lib/raid6/sse1.c @@ -21,7 +21,7 @@ * worthwhile as a separate implementation. 
*/ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32 #include #include "x86.h" diff --git a/trunk/lib/raid6/sse2.c b/trunk/lib/raid6/sse2.c index bc2d57daa589..85b82c85f28e 100644 --- a/trunk/lib/raid6/sse2.c +++ b/trunk/lib/raid6/sse2.c @@ -17,8 +17,6 @@ * */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #include #include "x86.h" @@ -159,9 +157,7 @@ const struct raid6_calls raid6_sse2x2 = { 1 /* Has cache hints */ }; -#endif - -#if defined(__x86_64__) && !defined(__arch_um__) +#ifdef CONFIG_X86_64 /* * Unrolled-by-4 SSE2 implementation @@ -259,4 +255,4 @@ const struct raid6_calls raid6_sse2x4 = { 1 /* Has cache hints */ }; -#endif +#endif /* CONFIG_X86_64 */ diff --git a/trunk/lib/raid6/test/Makefile b/trunk/lib/raid6/test/Makefile index c76151d94764..087332dbf8aa 100644 --- a/trunk/lib/raid6/test/Makefile +++ b/trunk/lib/raid6/test/Makefile @@ -10,6 +10,31 @@ LD = ld AWK = awk -f AR = ar RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o + +ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) +ifeq ($(ARCH),i386) + CFLAGS += -DCONFIG_X86_32 + IS_X86 = yes +endif +ifeq ($(ARCH),x86_64) + CFLAGS += -DCONFIG_X86_64 + IS_X86 = yes +endif + +ifeq ($(IS_X86),yes) + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX2=1) +else + HAS_ALTIVEC := $(shell echo -e '\#include \nvector int a;' |\ + gcc -c -x c - >&/dev/null && \ + rm ./-.o && echo yes) + ifeq ($(HAS_ALTIVEC),yes) + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o + endif +endif .c.o: $(CC) $(CFLAGS) -c -o $@ $< @@ -22,9 +47,7 @@ RANLIB = ranlib all: raid6.a raid6test -raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ - altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ - tables.o +raid6.a: $(OBJS) rm -f $@ $(AR) cq $@ $^ $(RANLIB) $@ diff --git a/trunk/lib/raid6/x86.h b/trunk/lib/raid6/x86.h index d55d63232c55..b7595484a815 100644 --- a/trunk/lib/raid6/x86.h +++ b/trunk/lib/raid6/x86.h @@ -45,19 +45,23 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ static inline int boot_cpu_has(int flag) { - u32 eax = (flag & 0x20) ? 0x80000001 : 1; - u32 ecx, edx; + u32 eax, ebx, ecx, edx; + + eax = (flag & 0x100) ? 7 : + (flag & 0x20) ? 0x80000001 : 1; + ecx = 0; asm volatile("cpuid" - : "+a" (eax), "=d" (edx), "=c" (ecx) - : : "ebx"); + : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); - return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; + return ((flag & 0x100 ? ebx : + (flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1; } #endif /* ndef __KERNEL__ */ diff --git a/trunk/lib/random32.c b/trunk/lib/random32.c index 938bde5876ac..52280d5526be 100644 --- a/trunk/lib/random32.c +++ b/trunk/lib/random32.c @@ -42,13 +42,13 @@ static DEFINE_PER_CPU(struct rnd_state, net_rand_state); /** - * prandom32 - seeded pseudo-random number generator. + * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. 
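As a worked instance of the user-space boot_cpu_has() emulation above: X86_FEATURE_AVX2 is 9*32 + 5 = 0x125, so the 0x100 bit selects CPUID leaf 7 (with ECX = 0) and the answer is read from EBX bit 5; X86_FEATURE_AVX is 4*32 + 28 = 0x9c, which falls through to leaf 1 with the 0x80 bit steering the test to ECX bit 28; and X86_FEATURE_MMXEXT is 1*32 + 22 = 0x36, whose 0x20 bit selects leaf 0x80000001 and EDX bit 22.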
* * This is used for pseudo-randomness with no outside seeding. - * For more random results, use random32(). + * For more random results, use prandom_u32(). */ -u32 prandom32(struct rnd_state *state) +u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s,a,b,c,d) ((s&c)<>b) @@ -58,32 +58,81 @@ u32 prandom32(struct rnd_state *state) return (state->s1 ^ state->s2 ^ state->s3); } -EXPORT_SYMBOL(prandom32); +EXPORT_SYMBOL(prandom_u32_state); /** - * random32 - pseudo random number generator + * prandom_u32 - pseudo random number generator * * A 32 bit pseudo-random number is generated using a fast * algorithm suitable for simulation. This algorithm is NOT * considered safe for cryptographic use. */ -u32 random32(void) +u32 prandom_u32(void) { unsigned long r; struct rnd_state *state = &get_cpu_var(net_rand_state); - r = prandom32(state); + r = prandom_u32_state(state); put_cpu_var(state); return r; } -EXPORT_SYMBOL(random32); +EXPORT_SYMBOL(prandom_u32); + +/* + * prandom_bytes_state - get the requested number of pseudo-random bytes + * + * @state: pointer to state structure holding seeded state. + * @buf: where to copy the pseudo-random bytes to + * @bytes: the requested number of bytes + * + * This is used for pseudo-randomness with no outside seeding. + * For more random results, use prandom_bytes(). + */ +void prandom_bytes_state(struct rnd_state *state, void *buf, int bytes) +{ + unsigned char *p = buf; + int i; + + for (i = 0; i < round_down(bytes, sizeof(u32)); i += sizeof(u32)) { + u32 random = prandom_u32_state(state); + int j; + + for (j = 0; j < sizeof(u32); j++) { + p[i + j] = random; + random >>= BITS_PER_BYTE; + } + } + if (i < bytes) { + u32 random = prandom_u32_state(state); + + for (; i < bytes; i++) { + p[i] = random; + random >>= BITS_PER_BYTE; + } + } +} +EXPORT_SYMBOL(prandom_bytes_state); + +/** + * prandom_bytes - get the requested number of pseudo-random bytes + * @buf: where to copy the pseudo-random bytes to + * @bytes: the requested number of bytes + */ +void prandom_bytes(void *buf, int bytes) +{ + struct rnd_state *state = &get_cpu_var(net_rand_state); + + prandom_bytes_state(state, buf, bytes); + put_cpu_var(state); +} +EXPORT_SYMBOL(prandom_bytes); /** - * srandom32 - add entropy to pseudo random number generator + * prandom_seed - add entropy to pseudo random number generator * @seed: seed value * - * Add some additional seeding to the random32() pool. + * Add some additional seeding to the prandom pool. */ -void srandom32(u32 entropy) +void prandom_seed(u32 entropy) { int i; /* @@ -95,13 +144,13 @@ void srandom32(u32 entropy) state->s1 = __seed(state->s1 ^ entropy, 1); } } -EXPORT_SYMBOL(srandom32); +EXPORT_SYMBOL(prandom_seed); /* * Generate some initially weak seeding values to allow - * to start the random32() engine. + * to start the prandom_u32() engine. */ -static int __init random32_init(void) +static int __init prandom_init(void) { int i; @@ -114,22 +163,22 @@ static int __init random32_init(void) state->s3 = __seed(LCG(state->s2), 15); /* "warm it up" */ - prandom32(state); - prandom32(state); - prandom32(state); - prandom32(state); - prandom32(state); - prandom32(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); } return 0; } -core_initcall(random32_init); +core_initcall(prandom_init); /* * Generate better values after random number generator * is fully initialized. 
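A small sketch of the renamed interface, assuming a hypothetical in-kernel caller and that the declarations live in <linux/random.h> as before: the seeded-state variants give reproducible sequences (the rbtree test below relies on this), while prandom_u32()/prandom_bytes() draw from the per-cpu state and remain unsuitable for cryptography.

#include <linux/random.h>

static void prandom_example(void)
{
	struct rnd_state rnd;
	u8 cookie[16];
	u32 first;

	prandom_seed_state(&rnd, 12345ULL);
	first = prandom_u32_state(&rnd);	/* deterministic for a fixed seed */

	prandom_bytes(cookie, sizeof(cookie));	/* global per-cpu generator */
	(void)first;
}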
*/ -static int __init random32_reseed(void) +static int __init prandom_reseed(void) { int i; @@ -143,8 +192,8 @@ static int __init random32_reseed(void) state->s3 = __seed(seeds[2], 15); /* mix it in */ - prandom32(state); + prandom_u32_state(state); } return 0; } -late_initcall(random32_reseed); +late_initcall(prandom_reseed); diff --git a/trunk/lib/rbtree_test.c b/trunk/lib/rbtree_test.c index 268b23951fec..af38aedbd874 100644 --- a/trunk/lib/rbtree_test.c +++ b/trunk/lib/rbtree_test.c @@ -96,8 +96,8 @@ static void init(void) { int i; for (i = 0; i < NODES; i++) { - nodes[i].key = prandom32(&rnd); - nodes[i].val = prandom32(&rnd); + nodes[i].key = prandom_u32_state(&rnd); + nodes[i].val = prandom_u32_state(&rnd); } } @@ -118,7 +118,7 @@ static void check(int nr_nodes) { struct rb_node *rb; int count = 0; - int blacks; + int blacks = 0; u32 prev_key = 0; for (rb = rb_first(&root); rb; rb = rb_next(rb)) { @@ -155,7 +155,7 @@ static int rbtree_test_init(void) printk(KERN_ALERT "rbtree testing"); - prandom32_seed(&rnd, 3141592653589793238ULL); + prandom_seed_state(&rnd, 3141592653589793238ULL); init(); time1 = get_cycles(); diff --git a/trunk/lib/scatterlist.c b/trunk/lib/scatterlist.c index 3675452b23ca..7874b01e816e 100644 --- a/trunk/lib/scatterlist.c +++ b/trunk/lib/scatterlist.c @@ -248,7 +248,8 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents, unsigned int left; #ifndef ARCH_HAS_SG_CHAIN - BUG_ON(nents > max_ents); + if (WARN_ON_ONCE(nents > max_ents)) + return -EINVAL; #endif memset(table, 0, sizeof(*table)); diff --git a/trunk/lib/vsprintf.c b/trunk/lib/vsprintf.c index 39c99fea7c03..fab33a9c5318 100644 --- a/trunk/lib/vsprintf.c +++ b/trunk/lib/vsprintf.c @@ -23,12 +23,12 @@ #include #include #include +#include #include #include #include #include /* for PAGE_SIZE */ -#include #include /* for dereference_function_descriptor() */ #include "kstrtox.h" @@ -38,6 +38,8 @@ * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtoull instead. */ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { @@ -61,6 +63,8 @@ EXPORT_SYMBOL(simple_strtoull); * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtoul instead. */ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { @@ -73,6 +77,8 @@ EXPORT_SYMBOL(simple_strtoul); * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtol instead. */ long simple_strtol(const char *cp, char **endp, unsigned int base) { @@ -88,6 +94,8 @@ EXPORT_SYMBOL(simple_strtol); * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtoll instead. 
*/ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { @@ -1485,7 +1493,10 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) num = va_arg(args, long); break; case FORMAT_TYPE_SIZE_T: - num = va_arg(args, size_t); + if (spec.flags & SIGN) + num = va_arg(args, ssize_t); + else + num = va_arg(args, size_t); break; case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); @@ -2013,7 +2024,11 @@ int vsscanf(const char *buf, const char *fmt, va_list args) char digit; int num = 0; u8 qualifier; - u8 base; + unsigned int base; + union { + long long s; + unsigned long long u; + } val; s16 field_width; bool is_sign; @@ -2053,8 +2068,11 @@ int vsscanf(const char *buf, const char *fmt, va_list args) /* get field width */ field_width = -1; - if (isdigit(*fmt)) + if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); + if (field_width <= 0) + break; + } /* get conversion qualifier */ qualifier = -1; @@ -2154,58 +2172,61 @@ int vsscanf(const char *buf, const char *fmt, va_list args) || (base == 0 && !isdigit(digit))) break; + if (is_sign) + val.s = qualifier != 'L' ? + simple_strtol(str, &next, base) : + simple_strtoll(str, &next, base); + else + val.u = qualifier != 'L' ? + simple_strtoul(str, &next, base) : + simple_strtoull(str, &next, base); + + if (field_width > 0 && next - str > field_width) { + if (base == 0) + _parse_integer_fixup_radix(str, &base); + while (next - str > field_width) { + if (is_sign) + val.s = div_s64(val.s, base); + else + val.u = div_u64(val.u, base); + --next; + } + } + switch (qualifier) { case 'H': /* that's 'hh' in format */ - if (is_sign) { - signed char *s = (signed char *)va_arg(args, signed char *); - *s = (signed char)simple_strtol(str, &next, base); - } else { - unsigned char *s = (unsigned char *)va_arg(args, unsigned char *); - *s = (unsigned char)simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, signed char *) = val.s; + else + *va_arg(args, unsigned char *) = val.u; break; case 'h': - if (is_sign) { - short *s = (short *)va_arg(args, short *); - *s = (short)simple_strtol(str, &next, base); - } else { - unsigned short *s = (unsigned short *)va_arg(args, unsigned short *); - *s = (unsigned short)simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, short *) = val.s; + else + *va_arg(args, unsigned short *) = val.u; break; case 'l': - if (is_sign) { - long *l = (long *)va_arg(args, long *); - *l = simple_strtol(str, &next, base); - } else { - unsigned long *l = (unsigned long *)va_arg(args, unsigned long *); - *l = simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, long *) = val.s; + else + *va_arg(args, unsigned long *) = val.u; break; case 'L': - if (is_sign) { - long long *l = (long long *)va_arg(args, long long *); - *l = simple_strtoll(str, &next, base); - } else { - unsigned long long *l = (unsigned long long *)va_arg(args, unsigned long long *); - *l = simple_strtoull(str, &next, base); - } + if (is_sign) + *va_arg(args, long long *) = val.s; + else + *va_arg(args, unsigned long long *) = val.u; break; case 'Z': case 'z': - { - size_t *s = (size_t *)va_arg(args, size_t *); - *s = (size_t)simple_strtoul(str, &next, base); - } - break; + *va_arg(args, size_t *) = val.u; + break; default: - if (is_sign) { - int *i = (int *)va_arg(args, int *); - *i = (int)simple_strtol(str, &next, base); - } else { - unsigned int *i = (unsigned int *)va_arg(args, unsigned int*); - *i = (unsigned int)simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, int *) = val.s; + 
else + *va_arg(args, unsigned int *) = val.u; break; } num++; diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index 71259e052ce8..278e3ab1f169 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -149,7 +149,18 @@ config MOVABLE_NODE depends on NO_BOOTMEM depends on X86_64 depends on NUMA - depends on BROKEN + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG diff --git a/trunk/mm/backing-dev.c b/trunk/mm/backing-dev.c index bd6a6cabef71..d3ca2b3ee176 100644 --- a/trunk/mm/backing-dev.c +++ b/trunk/mm/backing-dev.c @@ -10,7 +10,6 @@ #include #include #include -#include #include static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -222,63 +221,12 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio) -static ssize_t cpu_list_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct backing_dev_info *bdi = dev_get_drvdata(dev); - struct bdi_writeback *wb = &bdi->wb; - cpumask_var_t newmask; - ssize_t ret; - struct task_struct *task; - - if (!alloc_cpumask_var(&newmask, GFP_KERNEL)) - return -ENOMEM; - - ret = cpulist_parse(buf, newmask); - if (!ret) { - spin_lock_bh(&bdi->wb_lock); - task = wb->task; - if (task) - get_task_struct(task); - spin_unlock_bh(&bdi->wb_lock); - - mutex_lock(&bdi->flusher_cpumask_lock); - if (task) { - ret = set_cpus_allowed_ptr(task, newmask); - put_task_struct(task); - } - if (ret == 0) { - cpumask_copy(bdi->flusher_cpumask, newmask); - ret = count; - } - mutex_unlock(&bdi->flusher_cpumask_lock); - - } - free_cpumask_var(newmask); - - return ret; -} - -static ssize_t cpu_list_show(struct device *dev, - struct device_attribute *attr, char *page) -{ - struct backing_dev_info *bdi = dev_get_drvdata(dev); - ssize_t ret; - - mutex_lock(&bdi->flusher_cpumask_lock); - ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask); - mutex_unlock(&bdi->flusher_cpumask_lock); - - return ret; -} - #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), __ATTR_RW(min_ratio), __ATTR_RW(max_ratio), - __ATTR_RW(cpu_list), __ATTR_NULL, }; @@ -480,7 +428,6 @@ static int bdi_forker_thread(void *ptr) writeback_inodes_wb(&bdi->wb, 1024, WB_REASON_FORKER_THREAD); } else { - int ret; /* * The spinlock makes sure we do not lose * wake-ups when racing with 'bdi_queue_work()'. 
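To make the new vsscanf() field-width handling above concrete: the conversion is parsed in full first and then divided back down by the base until it fits the width, so a sketch like the following (using the kernel's sscanf() wrapper) splits consecutive digits instead of overrunning the width.

#include <linux/kernel.h>

static void sscanf_width_example(void)
{
	int a = 0, b = 0;

	/* "%2d" parses 1234, then divides by 10 until two digits remain */
	sscanf("1234", "%2d%2d", &a, &b);	/* a == 12, b == 34 */
}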
@@ -490,14 +437,6 @@ static int bdi_forker_thread(void *ptr) spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; spin_unlock_bh(&bdi->wb_lock); - mutex_lock(&bdi->flusher_cpumask_lock); - ret = set_cpus_allowed_ptr(task, - bdi->flusher_cpumask); - mutex_unlock(&bdi->flusher_cpumask_lock); - if (ret) - printk_once("%s: failed to bind flusher" - " thread %s, error %d\n", - __func__, task->comm, ret); wake_up_process(task); } bdi_clear_pending(bdi); @@ -570,17 +509,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, dev_name(dev)); if (IS_ERR(wb->task)) return PTR_ERR(wb->task); - } else { - int node; - /* - * Set up a default cpumask for the flusher threads that - * includes all cpus on the same numa node as the device. - * The mask may be overridden via sysfs. - */ - node = dev_to_node(bdi->dev); - if (node != NUMA_NO_NODE) - cpumask_copy(bdi->flusher_cpumask, - cpumask_of_node(node)); } bdi_debug_register(bdi, dev_name(dev)); @@ -706,15 +634,6 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); - if (!bdi_cap_flush_forker(bdi)) { - bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!bdi->flusher_cpumask) - return -ENOMEM; - cpumask_setall(bdi->flusher_cpumask); - mutex_init(&bdi->flusher_cpumask_lock); - } else - bdi->flusher_cpumask = NULL; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); if (err) @@ -737,7 +656,6 @@ int bdi_init(struct backing_dev_info *bdi) err: while (i--) percpu_counter_destroy(&bdi->bdi_stat[i]); - kfree(bdi->flusher_cpumask); } return err; @@ -765,8 +683,6 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); - kfree(bdi->flusher_cpumask); - /* * If bdi_unregister() had already been called earlier, the * wakeup_timer could still be armed because bdi_prune_sb() diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index e5318c7793ae..4f3ea0b1e57c 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); return 0; } @@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgoup details. - */ - if (order >= HUGETLB_CGROUP_MIN_ORDER) - hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/trunk/mm/hugetlb_cgroup.c b/trunk/mm/hugetlb_cgroup.c index b5bde7a5c017..9cea7de22ffb 100644 --- a/trunk/mm/hugetlb_cgroup.c +++ b/trunk/mm/hugetlb_cgroup.c @@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -int __init hugetlb_cgroup_file_init(int idx) +static void __init __hugetlb_cgroup_file_init(int idx) { char buf[32]; struct cftype *cft; @@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - return 0; + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. 
This is because we use + * page[2].lru.next for storing cgroup details. + */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } } /* diff --git a/trunk/mm/kmemleak.c b/trunk/mm/kmemleak.c index a217cc544060..752a705c77c2 100644 --- a/trunk/mm/kmemleak.c +++ b/trunk/mm/kmemleak.c @@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); diff --git a/trunk/mm/ksm.c b/trunk/mm/ksm.c index 82dfb4b54321..51573858938d 100644 --- a/trunk/mm/ksm.c +++ b/trunk/mm/ksm.c @@ -1624,7 +1624,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1648,7 +1648,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, if (!search_new_forks || !mapcount) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); if (!mapcount) goto out; } @@ -1678,7 +1678,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1697,11 +1697,11 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; @@ -1731,7 +1731,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1749,11 +1749,11 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; diff --git a/trunk/mm/memcontrol.c b/trunk/mm/memcontrol.c index bbfac5063ca8..f3009b4bae51 100644 --- a/trunk/mm/memcontrol.c +++ b/trunk/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -267,6 +271,10 @@ struct mem_cgroup { struct work_struct work_freeing; }; + /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. 
@@ -282,6 +290,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -332,8 +341,61 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ +}; + +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a @@ -388,9 +450,13 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. 
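A worked instance of the cft->private encoding above with the new type: _KMEM is 3 in enum res_type, so MEMFILE_PRIVATE(_KMEM, RES_LIMIT) evaluates to (3 << 16) | RES_LIMIT, from which MEMFILE_TYPE() recovers 3 (_KMEM) and MEMFILE_ATTR() recovers RES_LIMIT.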
+ */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) { + static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -1453,6 +1588,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -2060,20 +2199,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. 
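consume_stock() above only succeeds when the per-cpu stock is cached for the same memcg and holds at least nr_pages; requests larger than the batch size skip the stock entirely and fall back to res_counter charging. A single-threaded userspace sketch of that decision (CHARGE_BATCH here is a stand-in value):

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U	/* stand-in for the kernel constant */

struct stock { const void *cached; unsigned int nr_pages; };

static bool consume_stock(struct stock *stock, const void *memcg,
			  unsigned int nr_pages)
{
	if (nr_pages > CHARGE_BATCH)
		return false;			/* too large, charge the res_counter */
	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		return true;
	}
	return false;				/* miss: caller charges and refills */
}

int main(void)
{
	int memcg_a, memcg_b;
	struct stock s = { .cached = &memcg_a, .nr_pages = 16 };

	printf("%d\n", consume_stock(&s, &memcg_a, 4));		/* hit */
	printf("%d\n", consume_stock(&s, &memcg_b, 4));		/* different memcg */
	printf("%d\n", consume_stock(&s, &memcg_a, 64));	/* over the batch */
	return 0;
}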
*/ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2250,7 +2397,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2371,7 +2519,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, memcg = *ptr; if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2396,7 +2544,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2431,7 +2579,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; @@ -2624,183 +2773,943 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* - * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compound_lock. - * charge/uncharge will be never happen and move_account() is done under - * compound_lock(), so we don't have to take care of races. 
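mem_cgroup_do_charge() now takes min_pages so the "retry with a single page instead of the batch" heuristic also works for kmem charges larger than one page. A loose, single-call userspace model of the order of the early checks only (the flag bits, the costly-order constant, and the reclaim outcome are stand-ins, and the real function also weighs OOM afterwards):

#include <stdbool.h>
#include <stdio.h>

#define GFP_WAIT	0x1u	/* stand-in for __GFP_WAIT */
#define GFP_NORETRY	0x2u	/* stand-in for __GFP_NORETRY */
#define PAGE_ALLOC_COSTLY_ORDER	3

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM, CHARGE_WOULDBLOCK };

static int charge_decision(unsigned int gfp, unsigned int nr_pages,
			   unsigned int min_pages, unsigned int margin,
			   bool reclaimed_some)
{
	/* never reclaim on behalf of optional batching: retry with min_pages */
	if (nr_pages > min_pages)
		return CHARGE_RETRY;
	if (!(gfp & GFP_WAIT))
		return CHARGE_WOULDBLOCK;
	if (gfp & GFP_NORETRY)
		return CHARGE_NOMEM;
	/* reclaim would run here; retry if it opened enough margin */
	if (margin >= nr_pages)
		return CHARGE_RETRY;
	/* small (non-costly) requests retry if reclaim made any progress */
	if (nr_pages <= (1u << PAGE_ALLOC_COSTLY_ORDER) && reclaimed_some)
		return CHARGE_RETRY;
	return CHARGE_NOMEM;
}

int main(void)
{
	printf("%d\n", charge_decision(GFP_WAIT, 32, 1, 0, false));	/* batch -> retry single */
	printf("%d\n", charge_decision(0, 1, 1, 0, false));		/* atomic -> wouldblock */
	printf("%d\n", charge_decision(GFP_WAIT | GFP_NORETRY, 1, 1, 0, false)); /* noretry -> nomem */
	return 0;
}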
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. */ -void mem_cgroup_split_huge_fixup(struct page *head) +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) { - struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *pc; - int i; + struct kmem_cache *cachep; - if (mem_cgroup_disabled()) - return; - for (i = 1; i < HPAGE_PMD_NR; i++) { - pc = head_pc + i; - pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - } + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * mem_cgroup_move_account - move account of the page - * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) - * @pc: page_cgroup of the page. - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. - * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, - struct page_cgroup *pc, - struct mem_cgroup *from, - struct mem_cgroup *to) +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) { - unsigned long flags; - int ret; - bool anon = PageAnon(page); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; - VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(page)); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + if (!memcg_can_account_kmem(memcg)) + return -EIO; - lock_page_cgroup(pc); + print_slabinfo_header(m); - ret = -EINVAL; - if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); - move_lock_mem_cgroup(from, &flags); + return 0; +} +#endif - if (!anon && page_mapped(page)) { - /* Update mapped_file data for mem_cgroup */ - preempt_disable(); - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - preempt_enable(); - } - mem_cgroup_charge_statistics(from, anon, -nr_pages); +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; - /* caller should have done css_get */ - pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, anon, nr_pages); - move_unlock_mem_cgroup(from, &flags); - ret = 0; -unlock: - unlock_page_cgroup(pc); /* - * check events + * Conditions under which we can wait for the oom_killer. 
Those are + * the same conditions tested by the core page allocator */ - memcg_check_events(to, page); - memcg_check_events(from, page); -out: + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chosed to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + return ret; } -/** - * mem_cgroup_move_parent - moves page to the parent group - * @page: the page to move - * @pc: page_cgroup of the page - * @child: page's cgroup - * - * move charges to its parent or the root cgroup if the group has no - * parent (aka use_hierarchy==0). - * Although this might fail (get_page_unless_zero, isolate_lru_page or - * mem_cgroup_move_account fails) the failure is always temporary and - * it signals a race with a page removal/uncharge or migration. In the - * first case the page is on the way out and it will vanish from the LRU - * on the next attempt and the call should be retried later. - * Isolation from the LRU fails only if page has been isolated from - * the LRU since we looked at it and that usually means either global - * reclaim or migration going on. The page will either get back to the - * LRU or vanish. - * Finaly mem_cgroup_move_account fails only if the page got uncharged - * (!PageCgroupUsed) or moved to a different group. The page will - * disappear in the next attempt. - */ -static int mem_cgroup_move_parent(struct page *page, - struct page_cgroup *pc, - struct mem_cgroup *child) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { - struct mem_cgroup *parent; - unsigned int nr_pages; - unsigned long uninitialized_var(flags); - int ret; + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); - VM_BUG_ON(mem_cgroup_is_root(child)); + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; - ret = -EBUSY; - if (!get_page_unless_zero(page)) - goto out; - if (isolate_lru_page(page)) - goto put; + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); +} - nr_pages = hpage_nr_pages(page); +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; - parent = parent_mem_cgroup(child); + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. 
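memcg_charge_kmem() charges the kmem counter first and rolls it back if the user counters (res, plus memsw when swap accounting is on) cannot be charged, so kmem never exceeds what the user counters saw; memcg_uncharge_kmem() releases all three. A stand-in model with plain integers instead of res_counter, ignoring the -EINTR bypass path described above:

#include <stdio.h>

struct counters { long res, memsw, kmem; };

static int charge_kmem(struct counters *c, long size, int user_charge_fails)
{
	c->kmem += size;
	if (user_charge_fails) {
		c->kmem -= size;	/* roll back the kmem charge */
		return -1;
	}
	c->res += size;
	c->memsw += size;
	return 0;
}

static void uncharge_kmem(struct counters *c, long size)
{
	c->res -= size;
	c->memsw -= size;
	c->kmem -= size;
}

int main(void)
{
	struct counters c = { 0, 0, 0 };

	charge_kmem(&c, 4096, 0);	/* successful charge */
	charge_kmem(&c, 4096, 1);	/* fails, leaves all counters untouched */
	uncharge_kmem(&c, 4096);
	printf("res=%ld memsw=%ld kmem=%ld\n", c.res, c.memsw, c.kmem);
	return 0;
}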
+ */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; /* - * If no parent, move charges to root cgroup. + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. */ - if (!parent) - parent = root_mem_cgroup; + memcg_kmem_set_activated(memcg); - if (nr_pages > 1) { - VM_BUG_ON(!PageTransHuge(page)); - flags = compound_lock_irqsave(page); + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; } - ret = mem_cgroup_move_account(page, nr_pages, - pc, child, parent); - if (!ret) - __mem_cgroup_cancel_local_charge(child, nr_pages); + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} - if (nr_pages > 1) - compound_unlock_irqrestore(page, flags); - putback_lru_page(page); -put: - put_page(page); -out: - return ret; +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; } /* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +void memcg_update_array_size(int num) { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - bool oom = true; - int ret; + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} - if (PageTransHuge(page)) { +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. 
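memcg_caches_array_size() above sizes the per-root-cache array by doubling the requested group count and clamping the result to [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE]. The same logic as a standalone userspace function, for illustration:

#include <stdio.h>
#include <sys/types.h>

#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

static size_t caches_array_size(int num_groups)
{
	ssize_t size;

	if (num_groups <= 0)
		return 0;
	size = 2 * num_groups;			/* leave headroom for growth */
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;
	return size;
}

int main(void)
{
	printf("%zu %zu %zu\n",
	       caches_array_size(1),		/* clamped up to 4 */
	       caches_array_size(100),		/* doubled to 200 */
	       caches_array_size(1 << 20));	/* clamped to 65535 */
	return 0;
}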
+ * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + kfree(cur_params); + } + return 0; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: + kfree(s->memcg_params); +} + +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. 
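memcg_stop_kmem_account()/memcg_resume_kmem_account() above keep a per-task depth counter that turns the accounting hook into a no-op while cache creation itself has to allocate, which is what breaks the recursion described in the comment. A minimal single-threaded model (all names below are illustrative only):

#include <stdio.h>

static int skip_account;	/* stands in for current->memcg_kmem_skip_account */

static void stop_account(void)   { skip_account++; }
static void resume_account(void) { skip_account--; }

static const char *get_cache(void)
{
	if (skip_account)
		return "root cache (accounting skipped)";
	return "per-memcg cache";
}

static void create_cache_enqueue(void)
{
	stop_account();
	/* allocations made while enqueueing see the guard and stay unaccounted */
	printf("inside creation: %s\n", get_cache());
	resume_account();
}

int main(void)
{
	printf("normal path:    %s\n", get_cache());
	create_cache_enqueue();
	printf("after creation: %s\n", get_cache());
	return 0;
}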
+ * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. + * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. + */ + schedule_work(&cachep->memcg_params->destroy); +} + +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor, s); + + if (new) + new->allocflags |= __GFP_KMEMCG; + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. 
+ */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + atomic_set(&new_cachep->memcg_params->nr_pages , 0); + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should caught this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. + * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still have active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. + */ + c->memcg_params->dead = false; + cancel_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. 
+ */ +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. + */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to mare sure we're always seeing the up to date value. The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. 
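__memcg_kmem_get_cache() above returns the per-memcg copy when it already exists and otherwise falls back to the root cache while the copy is created asynchronously. A userspace sketch of that lookup policy (the structures, and the "dentry(2:grp)" name mimicking the "%s(%d:%s)" scheme of memcg_cache_name(), are stand-ins):

#include <stddef.h>
#include <stdio.h>

#define NR_GROUPS 4

struct cache {
	const char *name;
	struct cache *memcg_caches[NR_GROUPS];	/* indexed by memcg_cache_id() */
};

static struct cache *pick_cache(struct cache *root, int memcg_id, int can_account)
{
	if (!can_account || memcg_id < 0)
		return root;
	if (root->memcg_caches[memcg_id] == NULL) {
		/* the kernel would schedule creation here; allocate from root for now */
		return root;
	}
	return root->memcg_caches[memcg_id];
}

int main(void)
{
	struct cache child = { .name = "dentry(2:grp)" };
	struct cache root  = { .name = "dentry" };

	printf("%s\n", pick_cache(&root, 2, 1)->name);	/* not created yet -> root */
	root.memcg_caches[2] = &child;
	printf("%s\n", pick_cache(&root, 2, 1)->name);	/* now the per-memcg copy */
	return 0;
}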
+ */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. + */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +} +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. 
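__memcg_kmem_newpage_charge()/__memcg_kmem_commit_charge() above split accounting into a charge step taken before the page exists and a commit step that either binds the memcg to the page or reverts the charge when the allocation failed. A userspace stand-in for that two-phase protocol (the structs below are not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct memcg { long kmem; };
struct page  { struct memcg *owner; };

static bool newpage_charge(struct memcg *cur, struct memcg **handle, long bytes)
{
	*handle = NULL;
	if (!cur)
		return true;		/* nothing to account against */
	cur->kmem += bytes;
	*handle = cur;			/* remembered for the commit step */
	return true;
}

static void commit_charge(struct page *page, struct memcg *memcg, long bytes)
{
	if (!memcg)
		return;
	if (!page) {			/* allocation failed: revert */
		memcg->kmem -= bytes;
		return;
	}
	page->owner = memcg;		/* the kernel records this in page_cgroup */
}

int main(void)
{
	struct memcg m = { 0 };
	struct memcg *handle;
	struct page p = { 0 };

	newpage_charge(&m, &handle, 4096);
	commit_charge(&p, handle, 4096);	/* success path */
	newpage_charge(&m, &handle, 4096);
	commit_charge(NULL, handle, 4096);	/* failed allocation reverts */
	printf("kmem=%ld owner=%p\n", m.kmem, (void *)p.owner);
	return 0;
}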
+ */ +void mem_cgroup_split_huge_fixup(struct page *head) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *pc; + int i; + + if (mem_cgroup_disabled()) + return; + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @pc: page_cgroup of the page. + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. + * + * The caller must confirm following. + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + bool anon = PageAnon(page); + + VM_BUG_ON(from == to); + VM_BUG_ON(PageLRU(page)); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + lock_page_cgroup(pc); + + ret = -EINVAL; + if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + goto unlock; + + move_lock_mem_cgroup(from, &flags); + + if (!anon && page_mapped(page)) { + /* Update mapped_file data for mem_cgroup */ + preempt_disable(); + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + preempt_enable(); + } + mem_cgroup_charge_statistics(from, anon, -nr_pages); + + /* caller should have done css_get */ + pc->mem_cgroup = to; + mem_cgroup_charge_statistics(to, anon, nr_pages); + move_unlock_mem_cgroup(from, &flags); + ret = 0; +unlock: + unlock_page_cgroup(pc); + /* + * check events + */ + memcg_check_events(to, page); + memcg_check_events(from, page); +out: + return ret; +} + +/** + * mem_cgroup_move_parent - moves page to the parent group + * @page: the page to move + * @pc: page_cgroup of the page + * @child: page's cgroup + * + * move charges to its parent or the root cgroup if the group has no + * parent (aka use_hierarchy==0). + * Although this might fail (get_page_unless_zero, isolate_lru_page or + * mem_cgroup_move_account fails) the failure is always temporary and + * it signals a race with a page removal/uncharge or migration. In the + * first case the page is on the way out and it will vanish from the LRU + * on the next attempt and the call should be retried later. + * Isolation from the LRU fails only if page has been isolated from + * the LRU since we looked at it and that usually means either global + * reclaim or migration going on. The page will either get back to the + * LRU or vanish. + * Finaly mem_cgroup_move_account fails only if the page got uncharged + * (!PageCgroupUsed) or moved to a different group. The page will + * disappear in the next attempt. 
+ */ +static int mem_cgroup_move_parent(struct page *page, + struct page_cgroup *pc, + struct mem_cgroup *child) +{ + struct mem_cgroup *parent; + unsigned int nr_pages; + unsigned long uninitialized_var(flags); + int ret; + + VM_BUG_ON(mem_cgroup_is_root(child)); + + ret = -EBUSY; + if (!get_page_unless_zero(page)) + goto out; + if (isolate_lru_page(page)) + goto put; + + nr_pages = hpage_nr_pages(page); + + parent = parent_mem_cgroup(child); + /* + * If no parent, move charges to root cgroup. + */ + if (!parent) + parent = root_mem_cgroup; + + if (nr_pages > 1) { + VM_BUG_ON(!PageTransHuge(page)); + flags = compound_lock_irqsave(page); + } + + ret = mem_cgroup_move_account(page, nr_pages, + pc, child, parent); + if (!ret) + __mem_cgroup_cancel_local_charge(child, nr_pages); + + if (nr_pages > 1) + compound_unlock_irqrestore(page, flags); + putback_lru_page(page); +put: + put_page(page); +out: + return ret; +} + +/* + * Charge the memory controller for page usage. + * Return + * 0 if the charge was successful + * < 0 if the cgroup is over its limit + */ +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, enum charge_type ctype) +{ + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + bool oom = true; + int ret; + + if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); /* @@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { @@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; + u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) cond_resched(); /* + * Kernel memory may not necessarily be trackable to a specific + * process. So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * * This is a safety check because mem_cgroup_force_empty_list * could have raced with mem_cgroup_replace_page_cache callers * so the lru seemed empty but the page could have been added * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. 
*/ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); } /* @@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. + */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } + must_inc_static_branch = true; + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. + */ + mem_cgroup_get(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. 
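memcg_update_kmem_limit() above establishes the limit and the cache index under cgroup_lock/set_limit_mutex, enables the static branch only after the locks are dropped, and sets KMEM_ACCOUNTED_ACTIVE last so accounting starts only once the call sites are patched. A compressed, single-threaded model of that sequence (the flag bits and the "static branch" boolean are stand-ins):

#include <stdio.h>

enum { KMEM_ACCOUNTED_ACTIVE, KMEM_ACCOUNTED_ACTIVATED, KMEM_ACCOUNTED_DEAD };

static unsigned long kmem_account_flags;
static int static_branch_enabled;	/* stands in for memcg_kmem_enabled_key */

static int kmem_accounting_active(void)
{
	/* simplification of memcg_kmem_enabled() && memcg_kmem_is_active() */
	return static_branch_enabled &&
	       (kmem_account_flags & (1 << KMEM_ACCOUNTED_ACTIVE));
}

int main(void)
{
	/* 1: under set_limit_mutex - limit set, cache index allocated */
	kmem_account_flags |= 1 << KMEM_ACCOUNTED_ACTIVATED;
	printf("after limit set:  %d\n", kmem_accounting_active());

	/* 2: locks dropped - patch the call sites */
	static_branch_enabled = 1;

	/* 3: finally flag the memcg as actively accounted */
	kmem_account_flags |= 1 << KMEM_ACCOUNTED_ACTIVE;
	printf("after activation: %d\n", kmem_accounting_active());
	return 0;
}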
+ * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + +#endif + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + goto out; + + memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + /* + * When that happen, we need to disable the static branch only on those + * memcgs that enabled it. To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); +#endif +out: + return ret; +} + /* * The user of this function is... * RES_LIMIT. @@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4054,7 +5097,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + 
enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); @@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + int ret; + + memcg->kmemcg_id = -1; + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + return mem_cgroup_sockets_init(memcg, ss); }; static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4748,6 +5821,37 @@ static struct cftype mem_cgroup_files[] = { .trigger = mem_cgroup_reset, .read = mem_cgroup_read, }, +#endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif #endif { }, /* terminate */ }; @@ -4816,16 +5920,29 @@ static struct mem_cgroup *mem_cgroup_alloc(void) } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. 
+ * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. */ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else vfree(memcg); } -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work) { - int node; + struct mem_cgroup *memcg; - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, free_work); + schedule_work(&memcg->work_freeing); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } @@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. 
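The freeing path above now runs mem_cgroup_put() -> call_rcu(free_rcu) -> schedule_work(free_work) -> __mem_cgroup_free(), so the final free happens in process context after an RCU grace period (the static keys disarmed there cannot be handled from the RCU callback). A stand-in model of that callback chain, with direct calls replacing RCU and the workqueue:

#include <stdio.h>

struct object { int refcnt; };

static void final_free(struct object *obj)
{
	/* where __mem_cgroup_free() disarms static keys and releases memory */
	printf("object %p freed in process context\n", (void *)obj);
}

static void work_callback(struct object *obj)
{
	final_free(obj);		/* models free_work() */
}

static void rcu_callback(struct object *obj)
{
	work_callback(obj);		/* free_rcu() would schedule_work() here */
}

static void put(struct object *obj)
{
	if (--obj->refcnt == 0)
		rcu_callback(obj);	/* the kernel uses call_rcu() here */
}

int main(void)
{
	struct object obj = { .refcnt = 1 };

	put(&obj);
	return 0;
}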
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index e6a3b933517e..e0a9b0ce4f10 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -3590,6 +3591,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { BUG(); + return 0; } #endif /* CONFIG_NUMA_BALANCING */ @@ -4117,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip) struct file *f = vma->vm_file; char *buf = (char *)__get_free_page(GFP_KERNEL); if (buf) { - char *p, *s; + char *p; p = d_path(&f->f_path, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; - s = strrchr(p, '/'); - if (s) - p = s+1; - printk("%s%s[%lx+%lx]", prefix, p, + printk("%s%s[%lx+%lx]", prefix, kbasename(p), vma->vm_start, vma->vm_end - vma->vm_start); free_page((unsigned long)buf); diff --git a/trunk/mm/memory_hotplug.c b/trunk/mm/memory_hotplug.c index 962e353aa86f..d04ed87bfacb 100644 --- a/trunk/mm/memory_hotplug.c +++ b/trunk/mm/memory_hotplug.c @@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ static bool can_online_high_movable(struct zone *zone) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. 
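print_vma_addr() above now uses kbasename() rather than open-coding the strrchr() lookup. A minimal userspace equivalent of that helper, assuming the usual "part after the last slash, or the whole string" semantics:

#include <stdio.h>
#include <string.h>

static const char *kbasename(const char *path)
{
	const char *tail = strrchr(path, '/');
	return tail ? tail + 1 : path;
}

int main(void)
{
	printf("%s\n", kbasename("/usr/lib/libc.so.6"));	/* libc.so.6 */
	printf("%s\n", kbasename("vmlinux"));			/* vmlinux */
	return 0;
}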
+ */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, diff --git a/trunk/mm/migrate.c b/trunk/mm/migrate.c index 32efd8028bc9..3b676b0c5c3e 100644 --- a/trunk/mm/migrate.c +++ b/trunk/mm/migrate.c @@ -1734,7 +1734,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, entry); + update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* * Finish the charge transaction under the page table lock to diff --git a/trunk/mm/mprotect.c b/trunk/mm/mprotect.c index 3dca970367db..94722a4d6b43 100644 --- a/trunk/mm/mprotect.c +++ b/trunk/mm/mprotect.c @@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_NUMA_BALANCING static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { spin_lock(&mm->page_table_lock); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); @@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { BUG(); } #endif /* CONFIG_NUMA_BALANCING */ -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -304,7 +305,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, 
nrpages); @@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index d037c8bc1512..2ad2ad168efe 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { + if (unlikely(compound_order(page) != order)) { bad_page(page); bad++; } @@ -2613,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2631,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2666,6 +2673,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2718,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. + */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { diff --git a/trunk/mm/shmem.c b/trunk/mm/shmem.c index 03f9ba8fb8e5..5c90d84c2b02 100644 --- a/trunk/mm/shmem.c +++ b/trunk/mm/shmem.c @@ -1719,7 +1719,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
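For the shmem lseek changes that follow (the 'origin' to 'whence' parameter rename and the SEEK_DATA/SEEK_HOLE handling), it may help to see the call from the other side: user space just passes the extra whence values to lseek(2). A small sketch, assuming a glibc system where _GNU_SOURCE exposes SEEK_DATA and SEEK_HOLE; the file path is only an example:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/sparse-demo", O_RDONLY);   /* illustrative path */
	if (fd < 0)
		return 1;

	/* First byte of data at or after offset 0, then the first hole
	 * after that data; -1 with errno ENXIO means "none left". */
	off_t data = lseek(fd, 0, SEEK_DATA);
	off_t hole = data >= 0 ? lseek(fd, data, SEEK_HOLE) : -1;

	printf("first data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}

For any other whence value, shmem_file_llseek() simply falls through to generic_file_llseek_size(), as the hunk below shows.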
*/ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int origin) + pgoff_t index, pgoff_t end, int whence) { struct page *page; struct pagevec pvec; @@ -1733,13 +1733,13 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pvec.nr = shmem_find_get_pages_and_swap(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) index = end; break; } for (i = 0; i < pvec.nr; i++, index++) { if (index < indices[i]) { - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { done = true; break; } @@ -1751,8 +1751,8 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, page = NULL; } if (index >= end || - (page && origin == SEEK_DATA) || - (!page && origin == SEEK_HOLE)) { + (page && whence == SEEK_DATA) || + (!page && whence == SEEK_HOLE)) { done = true; break; } @@ -1765,15 +1765,15 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, return index; } -static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t start, end; loff_t new_offset; - if (origin != SEEK_DATA && origin != SEEK_HOLE) - return generic_file_llseek_size(file, offset, origin, + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); mutex_lock(&inode->i_mutex); /* We're holding i_mutex so we can access i_size directly */ @@ -1785,12 +1785,12 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) else { start = offset >> PAGE_CACHE_SHIFT; end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset = shmem_seek_hole_data(mapping, start, end, whence); new_offset <<= PAGE_CACHE_SHIFT; if (new_offset > offset) { if (new_offset < inode->i_size) offset = new_offset; - else if (origin == SEEK_DATA) + else if (whence == SEEK_DATA) offset = -ENXIO; else offset = inode->i_size; diff --git a/trunk/mm/slab.c b/trunk/mm/slab.c index 33d3363658df..e7667a3584bc 100644 --- a/trunk/mm/slab.c +++ b/trunk/mm/slab.c @@ -87,7 +87,6 @@ */ #include -#include "slab.h" #include #include #include @@ -128,6 +127,8 @@ #include "internal.h" +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -162,23 +163,6 @@ */ static bool pfmemalloc_active __read_mostly; -/* Legal flag mask for kmem_cache_create(). 
*/ -#if DEBUG -# define CREATE_MASK (SLAB_RED_ZONE | \ - SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#endif - /* * kmem_bufctl_t: * @@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = { #undef CACHE }; -static struct arraycache_init initarray_cache __initdata = - { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache kmem_cache_boot = { - .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -662,6 +642,26 @@ static void init_node_lock_keys(int q) } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + struct kmem_list3 *l3; + l3 = cachep->nodelists[q]; + if (!l3) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -678,6 +678,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1576,29 +1587,34 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) } } +/* + * The memory after the last cpu cache pointer is used for the + * the nodelists pointer. + */ +static void setup_nodelists_pointer(struct kmem_cache *cachep) +{ + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; +} + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). */ void __init kmem_cache_init(void) { - size_t left_over; struct cache_sizes *sizes; struct cache_names *names; int i; - int order; - int node; kmem_cache = &kmem_cache_boot; + setup_nodelists_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; - for (i = 0; i < NUM_INIT_LISTS; i++) { + for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); - if (i < MAX_NUMNODES) - kmem_cache->nodelists[i] = NULL; - } + set_up_list3s(kmem_cache, CACHE_CACHE); /* @@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
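The new setup_nodelists_pointer() above (and the offsetof()-based size passed to create_boot_cache() a little further down) relies on a layout trick: the per-node list pointers are stored immediately after the per-CPU array that ends struct kmem_cache, so a single allocation covers both tables. A user-space sketch of the same idea, with invented struct and field names (struct cache, percpu, nodes) standing in for the kernel types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins; only the memory layout matters here. */
struct node_list { int dummy; };

struct cache {
	int limit;
	struct node_list **nodes;   /* will point into the same allocation */
	void *percpu[];             /* nr_cpus slots, then nr_nodes pointers */
};

int main(void)
{
	size_t nr_cpus = 4, nr_nodes = 2;
	size_t size = offsetof(struct cache, percpu) +
		      nr_cpus * sizeof(void *) +
		      nr_nodes * sizeof(struct node_list *);
	struct cache *c = calloc(1, size);

	if (!c)
		return 1;

	/* Equivalent of setup_nodelists_pointer(): the node table starts
	 * right after the last per-CPU slot. */
	c->nodes = (struct node_list **)&c->percpu[nr_cpus];
	c->nodes[0] = NULL;

	printf("one %zu-byte block holds both trailing arrays\n", size);
	free(c);
	return 0;
}

The payoff in the patch is that the bootstrap cache no longer needs the static kmem_cache_nodelists[MAX_NUMNODES] array being deleted above.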
*/ - node = numa_mem_id(); - /* 1) create the kmem_cache */ - INIT_LIST_HEAD(&slab_caches); - list_add(&kmem_cache->list, &slab_caches); - kmem_cache->colour_off = cache_line_size(); - kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; - kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *); - kmem_cache->object_size = kmem_cache->size; - kmem_cache->size = ALIGN(kmem_cache->object_size, - cache_line_size()); - kmem_cache->reciprocal_buffer_size = - reciprocal_value(kmem_cache->size); - - for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, kmem_cache->size, - cache_line_size(), 0, &left_over, &kmem_cache->num); - if (kmem_cache->num) - break; - } - BUG_ON(!kmem_cache->num); - kmem_cache->gfporder = order; - kmem_cache->colour = left_over / kmem_cache->colour_off; - kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + - sizeof(struct slab), cache_line_size()); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *), + SLAB_HWCACHE_ALIGN); + list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; - sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); - - if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; - sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); - } + sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, + sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); + + if (INDEX_AC != INDEX_L3) + sizes[INDEX_L3].cs_cachep = + create_kmalloc_cache(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); slab_early_init = 0; @@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. 
*/ - if (!sizes->cs_cachep) { - sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_cachep->name = names->name; - sizes->cs_cachep->size = sizes->cs_size; - sizes->cs_cachep->object_size = sizes->cs_size; - sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes->cs_cachep->list, &slab_caches); - } + if (!sizes->cs_cachep) + sizes->cs_cachep = create_kmalloc_cache(names->name, + sizes->cs_size, ARCH_KMALLOC_FLAGS); + #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_dmacachep->name = names->name_dma; - sizes->cs_dmacachep->size = sizes->cs_size; - sizes->cs_dmacachep->object_size = sizes->cs_size; - sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_dmacachep, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); - list_add(&sizes->cs_dmacachep->list, &slab_caches); + sizes->cs_dmacachep = create_kmalloc_cache( + names->name_dma, sizes->cs_size, + SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); #endif sizes++; names++; @@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* @@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (page->pfmemalloc) SetPageSlabPfmemalloc(page + i); } + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) __ClearPageSlab(page); page++; } + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) if (slab_state == DOWN) { /* - * Note: the first kmem_cache_create must create the cache + * Note: Creation of first cache (kmem_cache). + * The setup_list3s is taken care + * of by the caller of __kmem_cache_create + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + slab_state = PARTIAL; + } else if (slab_state == PARTIAL) { + /* + * Note: the second kmem_cache_create must create the cache * that's used by kmalloc(24), otherwise the creation of * further caches will BUG(). */ @@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /* * If the cache that's used by kmalloc(sizeof(kmem_list3)) is - * the first cache, then we need to set up all its list3s, + * the second cache, then we need to set up all its list3s, * otherwise the creation of further caches will BUG(). */ set_up_list3s(cachep, SIZE_AC); @@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) else slab_state = PARTIAL_ARRAYCACHE; } else { + /* Remaining boot caches */ cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init), gfp); @@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /** * __kmem_cache_create - Create a cache. 
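kmem_getpages() and kmem_freepages() now bracket every slab's pages with memcg_bind_pages()/memcg_release_pages(). As the slab.h hunk later in this patch shows, the release side uses an atomic "subtract and test for zero" so that freeing the last page of an empty per-memcg cache can trigger its destruction. A user-space rendering of that pattern with C11 atomics (cache_destroy, bind_pages and release_pages are invented names for the sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_pages = 0;

static void cache_destroy(void)
{
	puts("last page gone, destroying the cache");
}

static void bind_pages(int order)
{
	atomic_fetch_add(&nr_pages, 1 << order);
}

static void release_pages(int order)
{
	/* fetch_sub returns the old value, so reaching zero afterwards
	 * mirrors the kernel's atomic_sub_and_test(). */
	if (atomic_fetch_sub(&nr_pages, 1 << order) == (1 << order))
		cache_destroy();
}

int main(void)
{
	bind_pages(1);      /* a 2-page slab */
	release_pages(0);   /* one page left */
	release_pages(0);   /* last page: triggers the destroy path */
	return 0;
}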
- * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. + * @cachep: cache management descriptor * @flags: SLAB flags - * @ctor: A constructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. @@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - /* - * Always checks flags, a caller might be expecting debug support which - * isn't available. - */ - BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. This is needed to avoid @@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(BYTES_PER_WORD - 1); } - /* calculate the final buffer alignment: */ - - /* 1) arch recommendation: can be overridden for debug */ - if (flags & SLAB_HWCACHE_ALIGN) { - /* - * Default alignment: as specified by the arch code. Except if - * an object is really small, then squeeze multiple objects into - * one cacheline. - */ - ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - } else { - ralign = BYTES_PER_WORD; - } - /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated @@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(REDZONE_ALIGN - 1); } - /* 2) arch mandated alignment */ - if (ralign < ARCH_SLAB_MINALIGN) { - ralign = ARCH_SLAB_MINALIGN; - } /* 3) caller mandated alignment */ if (ralign < cachep->align) { ralign = cachep->align; @@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + setup_nodelists_pointer(cachep); #if DEBUG /* @@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); return 0; } @@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); @@ -3969,12 +3935,6 @@ void kfree(const void *objp) } EXPORT_SYMBOL(kfree); -unsigned int kmem_cache_size(struct kmem_cache *cachep) -{ - return cachep->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - /* * This initializes kmem_list3 or resizes various caches for all nodes. 
*/ @@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info) } /* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = cache_from_memcg(cachep, i); + if (c) + /* return value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + /* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); @@ -4276,54 +4275,8 @@ static void cache_reap(struct work_struct *w) } #ifdef CONFIG_SLABINFO - -static void print_slabinfo_header(struct seq_file *m) -{ - /* - * Output format version, so at least we can change it - * without _too_ many complaints. 
- */ -#if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else - seq_puts(m, "slabinfo - version: 2.1\n"); -#endif - seq_puts(m, "# name " - " "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); -#if STATS - seq_puts(m, " : globalstat " - " "); - seq_puts(m, " : cpustat "); -#endif - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) -{ - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p) if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->size, - cachep->num, (1 << cachep->gfporder)); - seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, cachep->shared); - seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = num_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ #if STATS { /* list3 stats */ unsigned long high = cachep->high_mark; @@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p) allochit, allocmiss, freehit, freemiss); } #endif - seq_putc(m, '\n'); - return 0; } -/* - * slabinfo_op - iterator that generates /proc/slabinfo - * - * Output layout: - * cache-name - * num-active-objs - * total-objs - * object size - * num-active-slabs - * total-slabs - * num-pages-per-slab - * + further values on SMP and with statistics enabled - */ - -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - #define MAX_SLABINFO_WRITE 128 /** * slabinfo_write - Tuning for the slab allocator @@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -static ssize_t slabinfo_write(struct file *file, const char __user *buffer, +ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; @@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, return res; } -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} - -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_DEBUG_SLAB_LEAK static void *leaks_start(struct seq_file *m, loff_t *pos) @@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } +static 
void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + static const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, @@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/trunk/mm/slab.h b/trunk/mm/slab.h index 7deeb449a301..34a98d642196 100644 --- a/trunk/mm/slab.h +++ b/trunk/mm/slab.h @@ -32,19 +32,201 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + /* Functions provided by the slab allocators */ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +struct mem_cgroup; #ifdef CONFIG_SLUB -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); #else -static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + int __kmem_cache_shutdown(struct kmem_cache *); +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline 
bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return s->memcg_params->memcg_caches[idx]; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} +#endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. 
%s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} #endif diff --git a/trunk/mm/slab_common.c b/trunk/mm/slab_common.c index 069a24e64403..3f3cd97d3fdf 100644 --- a/trunk/mm/slab_common.c +++ b/trunk/mm/slab_common.c @@ -13,9 +13,12 @@ #include #include #include +#include +#include #include #include #include +#include #include "slab.h" @@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, + size_t size) { struct kmem_cache *s = NULL; @@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) continue; } - if (!strcmp(s->name, name)) { + /* + * For simplicity, we won't check this in the list of memcg + * caches. We have control over memcg naming, and if there + * aren't duplicates in the global list, there won't be any + * duplicates in the memcg lists as well. + */ + if (!memcg && !strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, + const char *name, size_t size) { return 0; } #endif +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. + */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + + /* * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) * as davem. 
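To make the calculate_alignment() helper that slab_common.c now exports a bit more concrete: with SLAB_HWCACHE_ALIGN, a 64-byte cache line and a 24-byte object, ralign halves from 64 to 32 (since 24 <= 32) and stops there (24 > 16), so two such objects can share one cache line instead of each taking a whole line. The same arithmetic lifted into a stand-alone program, with the cache-line size hard-coded to 64 and sizeof(void *) standing in for ARCH_SLAB_MINALIGN:

#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

static unsigned long calc_align(int hwcache_align, unsigned long align,
				unsigned long size)
{
	if (hwcache_align) {
		unsigned long ralign = 64;   /* sample cache_line_size() */

		while (size <= ralign / 2)
			ralign /= 2;
		if (ralign > align)
			align = ralign;
	}
	if (align < sizeof(void *))          /* ARCH_SLAB_MINALIGN stand-in */
		align = sizeof(void *);
	return ALIGN_UP(align, sizeof(void *));
}

int main(void)
{
	printf("24-byte object  -> align %lu\n", calc_align(1, 0, 24));   /* 32 */
	printf("200-byte object -> align %lu\n", calc_align(1, 0, 200));  /* 64 */
	return 0;
}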
*/ -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *), + struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; int err = 0; @@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(name, size) == 0) + if (!kmem_cache_sanity_check(memcg, name, size) == 0) goto out_locked; + /* + * Some allocators will constraint the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags. + */ + flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(name, size, align, flags, ctor); + s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) goto out_locked; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (s) { s->object_size = s->size = size; - s->align = align; + s->align = calculate_alignment(flags, align, size); s->ctor = ctor; + + if (memcg_register_cache(memcg, s, parent_cache)) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { kmem_cache_free(kmem_cache, s); @@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align err = __kmem_cache_create(s, flags); if (!err) { - s->refcount = 1; list_add(&s->list, &slab_caches); - + memcg_cache_list_add(memcg, s); } else { kfree(s->name); kmem_cache_free(kmem_cache, s); @@ -157,10 +239,20 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align return s; } + +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); +} EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) { + /* Destroy all the children caches if we aren't a memcg cache */ + kmem_cache_destroy_memcg_children(s); + get_online_cpus(); mutex_lock(&slab_mutex); s->refcount--; @@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + memcg_release_cache(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { @@ -192,3 +285,182 @@ int slab_is_available(void) { return slab_state >= UP; } + +#ifndef CONFIG_SLOB +/* Create a cache during boot when no slab services are available yet */ +void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, + unsigned long flags) +{ + int err; + + s->name = name; + s->size = s->object_size = size; + s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + err = __kmem_cache_create(s, flags); + + if (err) + panic("Creation of kmalloc slab %s size=%zd failed. 
Reason %d\n", + name, size, err); + + s->refcount = -1; /* Exempt from merging for now */ +} + +struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, + unsigned long flags) +{ + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + if (!s) + panic("Out of memory when creating slab %s\n", name); + + create_boot_cache(s, name, size, flags); + list_add(&s->list, &slab_caches); + s->refcount = 1; + return s; +} + +#endif /* !CONFIG_SLOB */ + + +#ifdef CONFIG_SLABINFO +void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. + */ +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name " + " "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, " : globalstat " + " "); + seq_puts(m, " : cpustat "); +#endif + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&slab_mutex); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) +{ + struct slabinfo sinfo; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + + memcg_accumulate_slabinfo(s, &sinfo); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, + sinfo.objects_per_slab, (1 << sinfo.cache_order)); + + seq_printf(m, " : tunables %4u %4u %4u", + sinfo.limit, sinfo.batchcount, sinfo.shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); + slabinfo_show_stats(m, s); + seq_putc(m, '\n'); + return 0; +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + 
proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); +#endif /* CONFIG_SLABINFO */ diff --git a/trunk/mm/slob.c b/trunk/mm/slob.c index 1e921c5e9576..a99fdf7a0907 100644 --- a/trunk/mm/slob.c +++ b/trunk/mm/slob.c @@ -28,9 +28,8 @@ * from kmalloc are prepended with a 4-byte header with the kmalloc size. * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked, and also stores the exact - * allocation size in page->private so that it can be used to accurately - * provide ksize(). These objects are detected in kfree() because slob_page() + * does not have to be separately tracked. + * These objects are detected in kfree() because PageSlab() * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and @@ -59,7 +58,6 @@ #include #include -#include "slab.h" #include #include /* struct reclaim_state */ @@ -74,6 +72,7 @@ #include +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). @@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp) #define SLOB_UNIT sizeof(slob_t) #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) -#define SLOB_ALIGN L1_CACHE_BYTES /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which @@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) if (likely(order)) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - if (ret) { - struct page *page; - page = virt_to_page(ret); - page->private = size; - } trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); @@ -506,7 +499,7 @@ void kfree(const void *block) unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else - put_page(sp); + __free_pages(sp, compound_order(sp)); } EXPORT_SYMBOL(kfree); @@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree); size_t ksize(const void *block) { struct page *sp; + int align; + unsigned int *m; BUG_ON(!block); if (unlikely(block == ZERO_SIZE_PTR)) return 0; sp = virt_to_page(block); - if (PageSlab(sp)) { - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - unsigned int *m = (unsigned int *)(block - align); - return SLOB_UNITS(*m) * SLOB_UNIT; - } else - return sp->private; + if (unlikely(!PageSlab(sp))) + return PAGE_SIZE << compound_order(sp); + + align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; } EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - size_t align = c->size; - if (flags & SLAB_DESTROY_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } c->flags = flags; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - return 0; } @@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, PAGE_SIZE << get_order(c->size), flags, node); } @@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); -unsigned int kmem_cache_size(struct kmem_cache *c) -{ - return c->size; -} -EXPORT_SYMBOL(kmem_cache_size); - int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/trunk/mm/slub.c b/trunk/mm/slub.c index 487f0bdd53c0..ba2ca53f6c3a 100644 --- a/trunk/mm/slub.c +++ b/trunk/mm/slub.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -112,9 +113,6 @@ * the fast path and disables lockless freelists. */ -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) - static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG @@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -static int kmem_size = sizeof(struct kmem_cache); - #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif @@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing( if (!check_object(s, page, object, SLUB_RED_ACTIVE)) goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { + } else if (!page->slab_cache) { printk(KERN_ERR "SLUB : no slab for object 0x%p.\n", object); @@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - page->slab = s; + memcg_bind_pages(s, order); + page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); @@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = 
page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + + memcg_release_pages(s, order); reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h) else page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) @@ -1872,12 +1874,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freel /* * Unfreeze all the cpu partial slabs. * - * This function must be called with interrupt disabled. + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). */ -static void unfreeze_partials(struct kmem_cache *s) +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct kmem_cache_node *n = NULL, *n2 = NULL; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); struct page *page, *discard_page = NULL; while ((page = c->partial)) { @@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * set to the per node partial list. */ local_irq_save(flags); - unfreeze_partials(s); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); local_irq_restore(flags); oldpage = NULL; pobjects = 0; @@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s); + unfreeze_partials(s, c); } } @@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: /* @@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void *prior; void **object = (void *)x; int was_frozen; - int inuse; struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } prior = page->freelist; counters = page->counters; set_freepointer(s, object, prior); new.counters = counters; was_frozen = new.frozen; new.inuse--; - if ((!new.inuse || !prior) && !was_frozen && !n) { + if ((!new.inuse || !prior) && !was_frozen) { if (!kmem_cache_debug(s) && !prior) @@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } } - inuse = new.inuse; } while (!cmpxchg_double_slab(s, page, prior, counters, @@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + goto slab_empty; + /* - * was_frozen may have been set after we acquired the list_lock in - * an earlier loop. So we need to check it here again. + * Objects left in the slab. If it was not on the partial list before + * then add it. 
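The __slab_free() rework above keeps the usual optimistic shape: snapshot the freelist and counters, build the new values, publish them with one cmpxchg_double_slab(), and go around again if another CPU got there first (dropping the node list_lock before retrying). Stripped of the slab specifics, that read-modify-compare-exchange loop looks like the C11 sketch below; the lock-free list push is only an illustration of the pattern, not the slab code:

#include <stdatomic.h>
#include <stdio.h>

struct node {
	int value;
	struct node *next;
};

static _Atomic(struct node *) head = NULL;

static void push(struct node *n)
{
	struct node *old = atomic_load(&head);

	do {
		/* Speculative part: derive the new state from the snapshot. */
		n->next = old;
		/* Publish; on failure 'old' is reloaded and we try again. */
	} while (!atomic_compare_exchange_weak(&head, &old, n));
}

int main(void)
{
	static struct node a = { 1, NULL }, b = { 2, NULL };

	push(&a);
	push(&b);
	for (struct node *n = atomic_load(&head); n; n = n->next)
		printf("%d\n", n->value);   /* prints 2 then 1 */
	return 0;
}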
*/ - if (was_frozen) - stat(s, FREE_FROZEN); - else { - if (unlikely(!inuse && n->nr_partial > s->min_partial)) - goto slab_empty; - - /* - * Objects left in the slab. If it was not on the partial list before - * then add it. - */ - if (unlikely(!prior)) { - remove_full(s, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } + if (kmem_cache_debug(s) && unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return; @@ -2619,19 +2618,10 @@ static __always_inline void slab_free(struct kmem_cache *s, void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - if (kmem_cache_debug(s) && page->slab != s) { - pr_err("kmem_cache_free: Wrong slab cache. %s but object" - " is from %s\n", page->slab->name, s->name); - WARN_ON_ONCE(1); + s = cache_from_obj(s, x); + if (!s) return; - } - - slab_free(s, page, x, _RET_IP_); - + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved) return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static void init_kmem_cache_node(struct kmem_cache_node *n) { @@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->object_size; - unsigned long align = s->align; int order; /* @@ -2999,20 +2962,12 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) size += sizeof(void *); #endif - /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. - */ - align = calculate_alignment(flags, align, s->object_size); - s->align = align; - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. 
*/ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; if (forced_order >= 0) order = forced_order; @@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->max = s->oo; return !!oo_objects(s->oo); - } static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) @@ -3127,15 +3081,6 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) return -EINVAL; } -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { int rc = kmem_cache_close(s); - if (!rc) + if (!rc) { + /* + * We do the same lock strategy around sysfs_slab_add, see + * __kmem_cache_create. Because this is pretty much the last + * operation we do and the lock will be released shortly after + * that in slab_common.c, we could just move sysfs_slab_remove + * to a later point in common code. We should do that when we + * have a common sysfs framework for all allocators. + */ + mutex_unlock(&slab_mutex); sysfs_slab_remove(s); + mutex_lock(&slab_mutex); + } return rc; } @@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *__init create_kmalloc_cache(const char *name, - int size, unsigned int flags) -{ - struct kmem_cache *s; - - s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - - s->name = name; - s->size = s->object_size = size; - s->align = ARCH_KMALLOC_MINALIGN; - - /* - * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slab_mutex here. - */ - if (kmem_cache_open(s, flags)) - goto panic; - - list_add(&s->list, &slab_caches); - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); - return NULL; -} - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. 
This is necessary for slabs < 192 since we have non power @@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3424,7 +3354,7 @@ size_t ksize(const void *object) return PAGE_SIZE << compound_order(page); } - return slab_ksize(page->slab); + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); @@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x) } slab_lock(page); - if (on_freelist(page->slab, page, object)) { - object_err(page->slab, page, object, "Object is on free-list"); + if (on_freelist(page->slab_cache, page, object)) { + object_err(page->slab_cache, page, object, "Object is on free-list"); rv = false; } else { rv = true; @@ -3478,10 +3408,10 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - __free_pages(page, compound_order(page)); + __free_memcg_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self, /* * Used for early kmem_cache structures that were allocated using - * the page allocator + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. */ -static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - list_add(&s->list, &slab_caches); - s->refcount = -1; + memcpy(s, static_cache, kmem_cache->object_size); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) if (n) { list_for_each_entry(p, &n->partial, lru) - p->slab = s; + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) - p->slab = s; + p->slab_cache = s; #endif } } + list_add(&s->list, &slab_caches); + return s; } void __init kmem_cache_init(void) { + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; int i; - int caches = 0; - struct kmem_cache *temp_kmem_cache; - int order; - struct kmem_cache *temp_kmem_cache_node; - unsigned long kmalloc_size; + int caches = 2; if (debug_guardpage_minorder()) slub_max_order = 0; - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); - - /* Allocate two kmem_caches from the page allocator */ - kmalloc_size = ALIGN(kmem_size, cache_line_size()); - order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); - - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. 
- */ - kmem_cache_node = (void *)kmem_cache + kmalloc_size; + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; - kmem_cache_node->name = "kmem_cache_node"; - kmem_cache_node->size = kmem_cache_node->object_size = - sizeof(struct kmem_cache_node); - kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; - temp_kmem_cache = kmem_cache; - kmem_cache->name = "kmem_cache"; - kmem_cache->size = kmem_cache->object_size = kmem_size; - kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); - kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache, temp_kmem_cache, kmem_size); + kmem_cache = bootstrap(&boot_kmem_cache); /* * Allocate kmem_cache_node properly from the kmem_cache slab. * kmem_cache_node is separately allocated so no need to * update any list pointers. */ - temp_kmem_cache_node = kmem_cache_node; - - kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); - - kmem_cache_bootstrap_fixup(kmem_cache_node); - - caches++; - kmem_cache_bootstrap_fixup(kmem_cache); - caches++; - /* Free temporary boot structure */ - free_pages((unsigned long)temp_kmem_cache, order); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ @@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, +static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, if (s->size - size >= sizeof(void *)) continue; + if (!cache_match_memcg(s, memcg)) + continue; + return s; } return NULL; } -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(size, align, flags, name, ctor); + s = find_mergeable(memcg, size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (err) return err; + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. 
This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + + if (!is_root_cache(s)) + return; + + /* + * This means this cache had no attribute written. Therefore, no point + * in copying default values around. + */ + if (!s->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. + */ + if (buffer) + buf = buffer; + else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(s->memcg_params->root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; - - if (slab_state < FULL) - /* Defer until later */ - return 0; + int unmergeable = slab_unmergeable(s); - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. 
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name " - " "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; - struct kmem_cache *s; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p) nr_free += count_partial(n, count_free); } - nr_inuse = nr_objs - nr_free; - - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, oo_objects(s->oo), - (1 << oo_order(s->oo))); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); } -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int slabinfo_open(struct inode *inode, struct file *file) +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { - return seq_open(file, &slabinfo_op); } -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init slab_proc_init(void) +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { - proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); - return 0; + return -EIO; } -module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ diff --git a/trunk/mm/vmscan.c b/trunk/mm/vmscan.c index 7f3096137b8a..adc7e9058181 100644 --- a/trunk/mm/vmscan.c +++ b/trunk/mm/vmscan.c @@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get rescheduled. When there is a massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will shrink and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. 
*/ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) @@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } @@ -2558,7 +2570,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - int all_zones_ok; + struct zone *unbalanced_zone; unsigned long balanced; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -2592,7 +2604,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; - all_zones_ok = 1; + unbalanced_zone = NULL; balanced = 0; /* @@ -2731,7 +2743,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, } if (!zone_balanced(zone, testorder, 0, end_zone)) { - all_zones_ok = 0; + unbalanced_zone = zone; /* * We are still under min water mark. This * means that we have a GFP_ATOMIC allocation @@ -2764,7 +2776,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) + if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2774,7 +2786,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (has_under_min_watermark_zone) count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); else - congestion_wait(BLK_RW_ASYNC, HZ/10); + wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); } /* @@ -2793,7 +2805,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * high-order: Balanced zones must make up at least 25% of the node * for the node to be balanced */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { + if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) { cond_resched(); try_to_freeze(); diff --git a/trunk/net/atm/atm_sysfs.c b/trunk/net/atm/atm_sysfs.c index f49da5814bc3..350bf62b2ae3 100644 --- a/trunk/net/atm/atm_sysfs.c +++ b/trunk/net/atm/atm_sysfs.c @@ -14,49 +14,45 @@ static ssize_t show_type(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); - return sprintf(buf, "%s\n", adev->type); + + return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type); } static ssize_t show_address(struct device *cdev, struct device_attribute *attr, char *buf) { - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); - int i; - - for (i = 0; i < (ESI_LEN - 1); i++) - pos += sprintf(pos, "%02x:", adev->esi[i]); - pos += sprintf(pos, "%02x\n", adev->esi[i]); - return pos - buf; + return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi); } static ssize_t show_atmaddress(struct device *cdev, struct device_attribute *attr, char *buf) { unsigned long flags; - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); struct atm_dev_addr *aaddr; int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin; - int i, j; + int i, j, count = 0; spin_lock_irqsave(&adev->lock, flags); list_for_each_entry(aaddr, &adev->local, entry) { for (i = 
0, j = 0; i < ATM_ESA_LEN; ++i, ++j) { if (j == *fmt) { - pos += sprintf(pos, "."); + count += scnprintf(buf + count, + PAGE_SIZE - count, "."); ++fmt; j = 0; } - pos += sprintf(pos, "%02x", - aaddr->addr.sas_addr.prv[i]); + count += scnprintf(buf + count, + PAGE_SIZE - count, "%02x", + aaddr->addr.sas_addr.prv[i]); } - pos += sprintf(pos, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); } spin_unlock_irqrestore(&adev->lock, flags); - return pos - buf; + return count; } static ssize_t show_atmindex(struct device *cdev, @@ -64,25 +60,21 @@ static ssize_t show_atmindex(struct device *cdev, { struct atm_dev *adev = to_atm_dev(cdev); - return sprintf(buf, "%d\n", adev->number); + return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number); } static ssize_t show_carrier(struct device *cdev, struct device_attribute *attr, char *buf) { - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); - pos += sprintf(pos, "%d\n", - adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); - - return pos - buf; + return scnprintf(buf, PAGE_SIZE, "%d\n", + adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); } static ssize_t show_link_rate(struct device *cdev, struct device_attribute *attr, char *buf) { - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); int link_rate; @@ -100,9 +92,7 @@ static ssize_t show_link_rate(struct device *cdev, default: link_rate = adev->link_rate * 8 * 53; } - pos += sprintf(pos, "%d\n", link_rate); - - return pos - buf; + return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); diff --git a/trunk/net/bridge/br_mdb.c b/trunk/net/bridge/br_mdb.c index 6f0a2eebcb27..acc9f4cc18f7 100644 --- a/trunk/net/bridge/br_mdb.c +++ b/trunk/net/bridge/br_mdb.c @@ -83,9 +83,12 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, if (port) { struct br_mdb_entry e; e.ifindex = port->dev->ifindex; - e.addr.u.ip4 = p->addr.u.ip4; + e.state = p->state; + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif e.addr.proto = p->addr.proto; if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) { @@ -253,6 +256,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry) #endif } else return false; + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) + return false; return true; } @@ -310,7 +315,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group) + struct br_ip *group, unsigned char state) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -336,7 +341,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - p = br_multicast_new_port_group(port, group, *pp); + p = br_multicast_new_port_group(port, group, *pp, state); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); @@ -373,7 +378,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, #endif spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip); + ret = br_mdb_add_group(br, p, &ip, entry->state); spin_unlock_bh(&br->multicast_lock); return ret; } @@ -479,3 +484,10 @@ void br_mdb_init(void) rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL); } + +void br_mdb_uninit(void) +{ + rtnl_unregister(PF_BRIDGE, 
RTM_GETMDB); + rtnl_unregister(PF_BRIDGE, RTM_NEWMDB); + rtnl_unregister(PF_BRIDGE, RTM_DELMDB); +} diff --git a/trunk/net/bridge/br_multicast.c b/trunk/net/bridge/br_multicast.c index 1093c89095d8..5391ca43336a 100644 --- a/trunk/net/bridge/br_multicast.c +++ b/trunk/net/bridge/br_multicast.c @@ -279,7 +279,7 @@ static void br_multicast_port_group_expired(unsigned long data) spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || - hlist_unhashed(&pg->mglist)) + hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT) goto out; br_multicast_del_pg(br, pg); @@ -622,7 +622,8 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, - struct net_bridge_port_group __rcu *next) + struct net_bridge_port_group __rcu *next, + unsigned char state) { struct net_bridge_port_group *p; @@ -632,6 +633,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->addr = *group; p->port = port; + p->state = state; rcu_assign_pointer(p->next, next); hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, @@ -674,7 +676,7 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp); + p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); @@ -1165,7 +1167,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (max_delay) group = &mld->mld_mca; } else if (skb->len >= sizeof(*mld2q)) { - u16 mrc; if (!pskb_may_pull(skb, sizeof(*mld2q))) { err = -EINVAL; goto out; @@ -1173,8 +1174,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - mrc = ntohs(mld2q->mld2q_mrc); - max_delay = mrc ? MLDV2_MRC(mrc) : 1; + max_delay = mld2q->mld2q_mrc ? 
MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; } if (!group) @@ -1633,6 +1633,7 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->multicast_querier_timer); del_timer_sync(&br->multicast_query_timer); + br_mdb_uninit(); spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); if (!mdb) diff --git a/trunk/net/bridge/br_netlink.c b/trunk/net/bridge/br_netlink.c index dead9dfe865b..97ba0189c6f7 100644 --- a/trunk/net/bridge/br_netlink.c +++ b/trunk/net/bridge/br_netlink.c @@ -305,5 +305,4 @@ int __init br_netlink_init(void) void __exit br_netlink_fini(void) { rtnl_link_unregister(&br_link_ops); - rtnl_unregister_all(PF_BRIDGE); } diff --git a/trunk/net/bridge/br_private.h b/trunk/net/bridge/br_private.h index f21a739a6186..8d83be5ffedc 100644 --- a/trunk/net/bridge/br_private.h +++ b/trunk/net/bridge/br_private.h @@ -83,6 +83,7 @@ struct net_bridge_port_group { struct rcu_head rcu; struct timer_list timer; struct br_ip addr; + unsigned char state; }; struct net_bridge_mdb_entry @@ -443,8 +444,10 @@ extern void br_multicast_free_pg(struct rcu_head *head); extern struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, - struct net_bridge_port_group *next); + struct net_bridge_port_group *next, + unsigned char state); extern void br_mdb_init(void); +extern void br_mdb_uninit(void); extern void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type); diff --git a/trunk/net/core/net_namespace.c b/trunk/net/core/net_namespace.c index 6456439cbbd9..8acce01b6dab 100644 --- a/trunk/net/core/net_namespace.c +++ b/trunk/net/core/net_namespace.c @@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ + return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ + proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { + .init = net_ns_net_init, + .exit = net_ns_net_exit, +}; + static int __init net_ns_init(void) { struct net_generic *ng; @@ -412,6 +427,8 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); + register_pernet_subsys(&net_ns_ops); + return 0; } @@ -630,16 +647,29 @@ static void netns_put(void *ns) static int netns_install(struct nsproxy *nsproxy, void *ns) { + struct net *net = ns; + + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + put_net(nsproxy->net_ns); - nsproxy->net_ns = get_net(ns); + nsproxy->net_ns = get_net(net); return 0; } +static unsigned int netns_inum(void *ns) +{ + struct net *net = ns; + return net->proc_inum; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .inum = netns_inum, }; #endif diff --git a/trunk/net/dccp/ipv4.c b/trunk/net/dccp/ipv4.c index 176ecdba4a22..4f9f5eb478f1 100644 --- a/trunk/net/dccp/ipv4.c +++ b/trunk/net/dccp/ipv4.c @@ -439,8 +439,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); goto exit; } diff --git a/trunk/net/dccp/ipv6.c b/trunk/net/dccp/ipv6.c index 56840b249f3b..6e05981f271e 100644 --- a/trunk/net/dccp/ipv6.c +++ b/trunk/net/dccp/ipv6.c @@ 
-585,7 +585,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newinet->inet_rcv_saddr = LOOPBACK4_IPV6; if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); goto out; } __inet6_hash(newsk, NULL); diff --git a/trunk/net/ipv4/inet_connection_sock.c b/trunk/net/ipv4/inet_connection_sock.c index 2026542d6836..d0670f00d524 100644 --- a/trunk/net/ipv4/inet_connection_sock.c +++ b/trunk/net/ipv4/inet_connection_sock.c @@ -710,6 +710,22 @@ void inet_csk_destroy_sock(struct sock *sk) } EXPORT_SYMBOL(inet_csk_destroy_sock); +/* This function allows to force a closure of a socket after the call to + * tcp/dccp_create_openreq_child(). + */ +void inet_csk_prepare_forced_close(struct sock *sk) +{ + /* sk_clone_lock locked the socket and set refcnt to 2 */ + bh_unlock_sock(sk); + sock_put(sk); + + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} +EXPORT_SYMBOL(inet_csk_prepare_forced_close); + int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); diff --git a/trunk/net/ipv4/tcp_ipv4.c b/trunk/net/ipv4/tcp_ipv4.c index 1ed230716d51..54139fa514e6 100644 --- a/trunk/net/ipv4/tcp_ipv4.c +++ b/trunk/net/ipv4/tcp_ipv4.c @@ -1767,10 +1767,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - tcp_clear_xmit_timers(newsk); - tcp_cleanup_congestion_control(newsk); - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/trunk/net/ipv6/Makefile b/trunk/net/ipv6/Makefile index 2068ac4fbdad..4ea244891b58 100644 --- a/trunk/net/ipv6/Makefile +++ b/trunk/net/ipv6/Makefile @@ -41,6 +41,6 @@ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o obj-y += addrconf_core.o exthdrs_core.o -obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6_offload) +obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/trunk/net/ipv6/addrconf.c b/trunk/net/ipv6/addrconf.c index 6fca01f136ad..408cac4ae00a 100644 --- a/trunk/net/ipv6/addrconf.c +++ b/trunk/net/ipv6/addrconf.c @@ -534,8 +534,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); } static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { diff --git a/trunk/net/ipv6/ndisc.c b/trunk/net/ipv6/ndisc.c index f2a007b7bde3..6574175795df 100644 --- a/trunk/net/ipv6/ndisc.c +++ b/trunk/net/ipv6/ndisc.c @@ -1314,6 +1314,12 @@ static void ndisc_router_discovery(struct sk_buff *skb) static void ndisc_redirect_rcv(struct sk_buff *skb) { + u8 *hdr; + struct ndisc_options ndopts; + struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct rd_msg, opt)); + #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: @@ -1330,6 +1336,17 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) + return; + + if 
(!ndopts.nd_opts_rh) + return; + + hdr = (u8 *)ndopts.nd_opts_rh; + hdr += 8; + if (!pskb_pull(skb, hdr - skb_transport_header(skb))) + return; + icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } diff --git a/trunk/net/ipv6/tcp_ipv6.c b/trunk/net/ipv6/tcp_ipv6.c index 6565cf55eb1e..93825dd3a7c0 100644 --- a/trunk/net/ipv6/tcp_ipv6.c +++ b/trunk/net/ipv6/tcp_ipv6.c @@ -1288,7 +1288,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #endif if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto out; } __inet6_hash(newsk, NULL); diff --git a/trunk/net/mac802154/ieee802154_dev.c b/trunk/net/mac802154/ieee802154_dev.c index e748aed290aa..b7c7f815deae 100644 --- a/trunk/net/mac802154/ieee802154_dev.c +++ b/trunk/net/mac802154/ieee802154_dev.c @@ -224,9 +224,9 @@ void ieee802154_free_device(struct ieee802154_dev *hw) BUG_ON(!list_empty(&priv->slaves)); - wpan_phy_free(priv->phy); - mutex_destroy(&priv->slaves_mtx); + + wpan_phy_free(priv->phy); } EXPORT_SYMBOL(ieee802154_free_device); diff --git a/trunk/net/netlink/af_netlink.c b/trunk/net/netlink/af_netlink.c index c8a1eb6eca2d..c0353d55d56f 100644 --- a/trunk/net/netlink/af_netlink.c +++ b/trunk/net/netlink/af_netlink.c @@ -669,6 +669,9 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + if (addr_len < sizeof(struct sockaddr_nl)) + return -EINVAL; + if (nladdr->nl_family != AF_NETLINK) return -EINVAL; @@ -2059,7 +2062,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v) struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%pK %-3d %-6d %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", + seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", s, s->sk_protocol, nlk->portid, diff --git a/trunk/net/sctp/Kconfig b/trunk/net/sctp/Kconfig index a9edd2e205f4..c26210618e14 100644 --- a/trunk/net/sctp/Kconfig +++ b/trunk/net/sctp/Kconfig @@ -66,12 +66,36 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N +choice + prompt "Default SCTP cookie HMAC encoding" + default SCTP_COOKIE_HMAC_MD5 + help + This option sets the default sctp cookie hmac algorithm + when in doubt select 'md5' + +config SCTP_DEFAULT_COOKIE_HMAC_MD5 + bool "Enable optional MD5 hmac cookie generation" + help + Enable optional MD5 hmac based SCTP cookie generation + select SCTP_COOKIE_HMAC_MD5 + +config SCTP_DEFAULT_COOKIE_HMAC_SHA1 + bool "Enable optional SHA1 hmac cookie generation" + help + Enable optional SHA1 hmac based SCTP cookie generation + select SCTP_COOKIE_HMAC_SHA1 + +config SCTP_DEFAULT_COOKIE_HMAC_NONE + bool "Use no hmac alg in SCTP cookie generation" + help + Use no hmac algorithm in SCTP cookie generation + +endchoice config SCTP_COOKIE_HMAC_MD5 bool "Enable optional MD5 hmac cookie generation" help Enable optional MD5 hmac based SCTP cookie generation - default y select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 @@ -79,7 +103,6 @@ config SCTP_COOKIE_HMAC_SHA1 bool "Enable optional SHA1 hmac cookie generation" help Enable optional SHA1 hmac based SCTP cookie generation - default y select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 diff --git a/trunk/net/sctp/probe.c b/trunk/net/sctp/probe.c index bc6cd75cc1dc..5f7518de2fd1 100644 --- a/trunk/net/sctp/probe.c +++ b/trunk/net/sctp/probe.c @@ -122,7 +122,8 @@ static const struct file_operations 
sctpprobe_fops = { .llseek = noop_llseek, }; -sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep, +sctp_disposition_t jsctp_sf_eat_sack(struct net *net, + const struct sctp_endpoint *ep, const struct sctp_association *asoc, const sctp_subtype_t type, void *arg, diff --git a/trunk/net/sctp/protocol.c b/trunk/net/sctp/protocol.c index 2c7785bacf74..f898b1c58bd2 100644 --- a/trunk/net/sctp/protocol.c +++ b/trunk/net/sctp/protocol.c @@ -1191,9 +1191,9 @@ static int __net_init sctp_net_init(struct net *net) net->sctp.cookie_preserve_enable = 1; /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_CRYPTO_MD5) +#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_CRYPTO_SHA1) +#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) net->sctp.sctp_hmac_alg = "sha1"; #else net->sctp.sctp_hmac_alg = NULL; diff --git a/trunk/net/sunrpc/auth_gss/auth_gss.c b/trunk/net/sunrpc/auth_gss/auth_gss.c index 909dc0c31aab..6e5c824b040b 100644 --- a/trunk/net/sunrpc/auth_gss/auth_gss.c +++ b/trunk/net/sunrpc/auth_gss/auth_gss.c @@ -192,17 +192,23 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct const void *q; unsigned int seclen; unsigned int timeout; + unsigned long now = jiffies; u32 window_size; int ret; - /* First unsigned int gives the lifetime (in seconds) of the cred */ + /* First unsigned int gives the remaining lifetime in seconds of the + * credential - e.g. the remaining TGT lifetime for Kerberos or + * the -t value passed to GSSD. + */ p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); if (IS_ERR(p)) goto err; if (timeout == 0) timeout = GSSD_MIN_TIMEOUT; - ctx->gc_expiry = jiffies + (unsigned long)timeout * HZ * 3 / 4; - /* Sequence number window. Determines the maximum number of simultaneous requests */ + ctx->gc_expiry = now + ((unsigned long)timeout * HZ); + /* Sequence number window. Determines the maximum number of + * simultaneous requests + */ p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); if (IS_ERR(p)) goto err; @@ -237,9 +243,12 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct p = ERR_PTR(ret); goto err; } + dprintk("RPC: %s Success. 
gc_expiry %lu now %lu timeout %u\n", + __func__, ctx->gc_expiry, now, timeout); return q; err: - dprintk("RPC: %s returning %ld\n", __func__, -PTR_ERR(p)); + dprintk("RPC: %s returns %ld gc_expiry %lu now %lu timeout %u\n", + __func__, -PTR_ERR(p), ctx->gc_expiry, now, timeout); return p; } diff --git a/trunk/net/sunrpc/backchannel_rqst.c b/trunk/net/sunrpc/backchannel_rqst.c index a9c0bbccad6b..890a29912d5a 100644 --- a/trunk/net/sunrpc/backchannel_rqst.c +++ b/trunk/net/sunrpc/backchannel_rqst.c @@ -59,7 +59,7 @@ static void xprt_free_allocation(struct rpc_rqst *req) struct xdr_buf *xbufp; dprintk("RPC: free allocations for req= %p\n", req); - BUG_ON(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); + WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); xbufp = &req->rq_private_buf; free_page((unsigned long)xbufp->head[0].iov_base); xbufp = &req->rq_snd_buf; @@ -191,7 +191,9 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) dprintk("RPC: destroy backchannel transport\n"); - BUG_ON(max_reqs == 0); + if (max_reqs == 0) + goto out; + spin_lock_bh(&xprt->bc_pa_lock); xprt_dec_alloc_count(xprt, max_reqs); list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { @@ -202,6 +204,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) } spin_unlock_bh(&xprt->bc_pa_lock); +out: dprintk("RPC: backchannel list empty= %s\n", list_empty(&xprt->bc_pa_list) ? "true" : "false"); } @@ -255,7 +258,7 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); smp_mb__before_clear_bit(); - BUG_ON(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); + WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); smp_mb__after_clear_bit(); diff --git a/trunk/net/sunrpc/bc_svc.c b/trunk/net/sunrpc/bc_svc.c index 0b2eb388cbda..15c7a8a1c24f 100644 --- a/trunk/net/sunrpc/bc_svc.c +++ b/trunk/net/sunrpc/bc_svc.c @@ -53,7 +53,7 @@ int bc_send(struct rpc_rqst *req) if (IS_ERR(task)) ret = PTR_ERR(task); else { - BUG_ON(atomic_read(&task->tk_count) != 1); + WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); ret = task->tk_status; rpc_put_task(task); } diff --git a/trunk/net/sunrpc/cache.c b/trunk/net/sunrpc/cache.c index fc2f7aa4dca7..9afa4393c217 100644 --- a/trunk/net/sunrpc/cache.c +++ b/trunk/net/sunrpc/cache.c @@ -775,11 +775,11 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); mutex_unlock(&inode->i_mutex); - BUG_ON(rp->offset); + WARN_ON_ONCE(rp->offset); return 0; } rq = container_of(rp->q.list.next, struct cache_request, q.list); - BUG_ON(rq->q.reader); + WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; spin_unlock(&queue_lock); diff --git a/trunk/net/sunrpc/clnt.c b/trunk/net/sunrpc/clnt.c index cdc7564b4512..822f020fa7f4 100644 --- a/trunk/net/sunrpc/clnt.c +++ b/trunk/net/sunrpc/clnt.c @@ -132,8 +132,10 @@ static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb, int error; dir = rpc_d_lookup_sb(sb, dir_name); - if (dir == NULL) + if (dir == NULL) { + pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); return dir; + } for (;;) { q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; @@ -192,7 +194,8 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, case RPC_PIPEFS_MOUNT: dentry = rpc_setup_pipedir_sb(sb, clnt, 
clnt->cl_program->pipe_dir_name); - BUG_ON(dentry == NULL); + if (!dentry) + return -ENOENT; if (IS_ERR(dentry)) return PTR_ERR(dentry); clnt->cl_dentry = dentry; @@ -234,7 +237,7 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { if (clnt->cl_program->pipe_dir_name == NULL) - break; + continue; if (rpc_clnt_skip_event(clnt, event)) continue; if (atomic_inc_not_zero(&clnt->cl_count) == 0) @@ -607,6 +610,13 @@ EXPORT_SYMBOL_GPL(rpc_killall_tasks); */ void rpc_shutdown_client(struct rpc_clnt *clnt) { + /* + * To avoid deadlock, never call rpc_shutdown_client from a + * workqueue context! + */ + WARN_ON_ONCE(current->flags & PF_WQ_WORKER); + might_sleep(); + dprintk_rcu("RPC: shutting down %s client for %s\n", clnt->cl_protname, rcu_dereference(clnt->cl_xprt)->servername); @@ -693,21 +703,19 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, const struct rpc_program *program, u32 vers) { + struct rpc_create_args args = { + .program = program, + .prognumber = program->number, + .version = vers, + .authflavor = old->cl_auth->au_flavor, + .client_name = old->cl_principal, + }; struct rpc_clnt *clnt; - const struct rpc_version *version; int err; - BUG_ON(vers >= program->nrvers || !program->version[vers]); - version = program->version[vers]; - clnt = rpc_clone_client(old); + clnt = __rpc_clone_client(&args, old); if (IS_ERR(clnt)) goto out; - clnt->cl_procinfo = version->procs; - clnt->cl_maxproc = version->nrprocs; - clnt->cl_protname = program->name; - clnt->cl_prog = program->number; - clnt->cl_vers = version->number; - clnt->cl_stats = program->stats; err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); @@ -832,7 +840,12 @@ int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flag }; int status; - BUG_ON(flags & RPC_TASK_ASYNC); + WARN_ON_ONCE(flags & RPC_TASK_ASYNC); + if (flags & RPC_TASK_ASYNC) { + rpc_release_calldata(task_setup_data.callback_ops, + task_setup_data.callback_data); + return -EINVAL; + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -908,7 +921,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, task->tk_action = call_bc_transmit; atomic_inc(&task->tk_count); - BUG_ON(atomic_read(&task->tk_count) != 2); + WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); out: @@ -1368,6 +1381,7 @@ call_refreshresult(struct rpc_task *task) return; case -ETIMEDOUT: rpc_delay(task, 3*HZ); + case -EKEYEXPIRED: case -EAGAIN: status = -EACCES; if (!task->tk_cred_retry) @@ -1654,7 +1668,6 @@ call_transmit(struct rpc_task *task) task->tk_action = call_transmit_status; /* Encode here so that rpcsec_gss can use correct sequence number. */ if (rpc_task_need_encode(task)) { - BUG_ON(task->tk_rqstp->rq_bytes_sent != 0); rpc_xdr_encode(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) { @@ -1738,7 +1751,6 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - BUG_ON(task->tk_status != 0); task->tk_status = xprt_prepare_transmit(task); if (task->tk_status == -EAGAIN) { /* @@ -1785,7 +1797,7 @@ call_bc_transmit(struct rpc_task *task) * We were unable to reply and will have to drop the * request. The server should reconnect and retransmit. 
*/ - BUG_ON(task->tk_status == -EAGAIN); + WARN_ON_ONCE(task->tk_status == -EAGAIN); printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); break; diff --git a/trunk/net/sunrpc/rpc_pipe.c b/trunk/net/sunrpc/rpc_pipe.c index 80f5dd23417d..fd10981ea792 100644 --- a/trunk/net/sunrpc/rpc_pipe.c +++ b/trunk/net/sunrpc/rpc_pipe.c @@ -1093,7 +1093,7 @@ void rpc_put_sb_net(const struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - BUG_ON(sn->pipefs_sb == NULL); + WARN_ON(sn->pipefs_sb == NULL); mutex_unlock(&sn->pipefs_sb_lock); } EXPORT_SYMBOL_GPL(rpc_put_sb_net); @@ -1152,14 +1152,19 @@ static void rpc_kill_sb(struct super_block *sb) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); + if (sn->pipefs_sb != sb) { + mutex_unlock(&sn->pipefs_sb_lock); + goto out; + } sn->pipefs_sb = NULL; mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", net, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); + put_net(net); +out: kill_litter_super(sb); } diff --git a/trunk/net/sunrpc/rpcb_clnt.c b/trunk/net/sunrpc/rpcb_clnt.c index a70acae496e4..411f332de0b3 100644 --- a/trunk/net/sunrpc/rpcb_clnt.c +++ b/trunk/net/sunrpc/rpcb_clnt.c @@ -884,7 +884,10 @@ static void encode_rpcb_string(struct xdr_stream *xdr, const char *string, u32 len; len = strlen(string); - BUG_ON(len > maxstrlen); + WARN_ON_ONCE(len > maxstrlen); + if (len > maxstrlen) + /* truncate and hope for the best */ + len = maxstrlen; p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, string, len); } diff --git a/trunk/net/sunrpc/sched.c b/trunk/net/sunrpc/sched.c index 6357fcb00c7e..d17a704aaf5f 100644 --- a/trunk/net/sunrpc/sched.c +++ b/trunk/net/sunrpc/sched.c @@ -98,6 +98,23 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } +static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) +{ + queue->priority = priority; +} + +static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) +{ + queue->owner = pid; + queue->nr = RPC_BATCH_COUNT; +} + +static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) +{ + rpc_set_waitqueue_priority(queue, queue->maxpriority); + rpc_set_waitqueue_owner(queue, 0); +} + /* * Add new request to a priority queue. 
*/ @@ -109,9 +126,11 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *t; INIT_LIST_HEAD(&task->u.tk_wait.links); - q = &queue->tasks[queue_priority]; if (unlikely(queue_priority > queue->maxpriority)) - q = &queue->tasks[queue->maxpriority]; + queue_priority = queue->maxpriority; + if (queue_priority > queue->priority) + rpc_set_waitqueue_priority(queue, queue_priority); + q = &queue->tasks[queue_priority]; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); @@ -133,7 +152,9 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { - BUG_ON (RPC_IS_QUEUED(task)); + WARN_ON_ONCE(RPC_IS_QUEUED(task)); + if (RPC_IS_QUEUED(task)) + return; if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); @@ -178,24 +199,6 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas task->tk_pid, queue, rpc_qname(queue)); } -static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) -{ - queue->priority = priority; - queue->count = 1 << (priority * 2); -} - -static inline void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) -{ - queue->owner = pid; - queue->nr = RPC_BATCH_COUNT; -} - -static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) -{ - rpc_set_waitqueue_priority(queue, queue->maxpriority); - rpc_set_waitqueue_owner(queue, 0); -} - static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues) { int i; @@ -334,7 +337,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, __rpc_add_wait_queue(q, task, queue_priority); - BUG_ON(task->tk_callback != NULL); + WARN_ON_ONCE(task->tk_callback != NULL); task->tk_callback = action; __rpc_add_timer(q, task); } @@ -343,7 +346,12 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action) { /* We shouldn't ever put an inactive task to sleep */ - BUG_ON(!RPC_IS_ACTIVATED(task)); + WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); + if (!RPC_IS_ACTIVATED(task)) { + task->tk_status = -EIO; + rpc_put_task_async(task); + return; + } /* * Protect the queue operations. @@ -358,7 +366,12 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, int priority) { /* We shouldn't ever put an inactive task to sleep */ - BUG_ON(!RPC_IS_ACTIVATED(task)); + WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); + if (!RPC_IS_ACTIVATED(task)) { + task->tk_status = -EIO; + rpc_put_task_async(task); + return; + } /* * Protect the queue operations. @@ -367,6 +380,7 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, __rpc_sleep_on_priority(q, task, action, priority - RPC_PRIORITY_LOW); spin_unlock_bh(&q->lock); } +EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** * __rpc_do_wake_up_task - wake up a single rpc_task @@ -451,8 +465,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q /* * Check if we need to switch queues. 
*/ - if (--queue->count) - goto new_owner; + goto new_owner; } /* @@ -697,7 +710,9 @@ static void __rpc_execute(struct rpc_task *task) dprintk("RPC: %5u __rpc_execute flags=0x%x\n", task->tk_pid, task->tk_flags); - BUG_ON(RPC_IS_QUEUED(task)); + WARN_ON_ONCE(RPC_IS_QUEUED(task)); + if (RPC_IS_QUEUED(task)) + return; for (;;) { void (*do_action)(struct rpc_task *); @@ -981,7 +996,7 @@ static void rpc_release_task(struct rpc_task *task) { dprintk("RPC: %5u release task\n", task->tk_pid); - BUG_ON (RPC_IS_QUEUED(task)); + WARN_ON_ONCE(RPC_IS_QUEUED(task)); rpc_release_resources_task(task); diff --git a/trunk/net/sunrpc/svc.c b/trunk/net/sunrpc/svc.c index 3ee7461926d8..dfa4ba69ff45 100644 --- a/trunk/net/sunrpc/svc.c +++ b/trunk/net/sunrpc/svc.c @@ -324,7 +324,9 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) * The caller checks for sv_nrpools > 1, which * implies that we've been initialized. */ - BUG_ON(m->count == 0); + WARN_ON_ONCE(m->count == 0); + if (m->count == 0) + return; switch (m->mode) { case SVC_POOL_PERCPU: @@ -585,7 +587,9 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) * We assume one is at most one page */ arghi = 0; - BUG_ON(pages > RPCSVC_MAXPAGES); + WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); + if (pages > RPCSVC_MAXPAGES) + pages = RPCSVC_MAXPAGES; while (pages) { struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); if (!p) @@ -946,7 +950,9 @@ int svc_register(const struct svc_serv *serv, struct net *net, unsigned int i; int error = 0; - BUG_ON(proto == 0 && port == 0); + WARN_ON_ONCE(proto == 0 && port == 0); + if (proto == 0 && port == 0) + return -EINVAL; for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { diff --git a/trunk/net/sunrpc/svc_xprt.c b/trunk/net/sunrpc/svc_xprt.c index 194d865fae72..b8e47fac7315 100644 --- a/trunk/net/sunrpc/svc_xprt.c +++ b/trunk/net/sunrpc/svc_xprt.c @@ -218,7 +218,9 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, */ static void svc_xprt_received(struct svc_xprt *xprt) { - BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + WARN_ON_ONCE(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) + return; /* As soon as we clear busy, the xprt could be closed and * 'put', so we need a reference to call svc_xprt_enqueue with: */ @@ -577,7 +579,10 @@ int svc_alloc_arg(struct svc_rqst *rqstp) /* now allocate needed pages. 
If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - BUG_ON(pages >= RPCSVC_MAXPAGES); + WARN_ON_ONCE(pages >= RPCSVC_MAXPAGES); + if (pages >= RPCSVC_MAXPAGES) + /* use as many pages as possible */ + pages = RPCSVC_MAXPAGES - 1; for (i = 0; i < pages ; i++) while (rqstp->rq_pages[i] == NULL) { struct page *p = alloc_page(GFP_KERNEL); @@ -926,7 +931,7 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) list_del_init(&xprt->xpt_list); - BUG_ON(!list_empty(&xprt->xpt_ready)); + WARN_ON_ONCE(!list_empty(&xprt->xpt_ready)); if (test_bit(XPT_TEMP, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); diff --git a/trunk/net/sunrpc/svcsock.c b/trunk/net/sunrpc/svcsock.c index 03827cef1fa7..cc3020d16789 100644 --- a/trunk/net/sunrpc/svcsock.c +++ b/trunk/net/sunrpc/svcsock.c @@ -84,7 +84,11 @@ static struct lock_class_key svc_slock_key[2]; static void svc_reclassify_socket(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); + + WARN_ON_ONCE(sock_owned_by_user(sk)); + if (sock_owned_by_user(sk)) + return; + switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", diff --git a/trunk/net/sunrpc/xdr.c b/trunk/net/sunrpc/xdr.c index 08f50afd5f2a..56055632f151 100644 --- a/trunk/net/sunrpc/xdr.c +++ b/trunk/net/sunrpc/xdr.c @@ -318,7 +318,10 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) tail = buf->tail; head = buf->head; - BUG_ON (len > head->iov_len); + + WARN_ON_ONCE(len > head->iov_len); + if (len > head->iov_len) + len = head->iov_len; /* Shift the tail first */ if (tail->iov_len != 0) { diff --git a/trunk/net/sunrpc/xprtsock.c b/trunk/net/sunrpc/xprtsock.c index 75853cabf4c9..68b0a81c31d5 100644 --- a/trunk/net/sunrpc/xprtsock.c +++ b/trunk/net/sunrpc/xprtsock.c @@ -1746,7 +1746,6 @@ static inline void xs_reclassify_socketu(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]); } @@ -1755,7 +1754,6 @@ static inline void xs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); } @@ -1764,13 +1762,16 @@ static inline void xs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); } static inline void xs_reclassify_socket(int family, struct socket *sock) { + WARN_ON_ONCE(sock_owned_by_user(sock->sk)); + if (sock_owned_by_user(sock->sk)) + return; + switch (family) { case AF_LOCAL: xs_reclassify_socketu(sock); @@ -1901,6 +1902,10 @@ static void xs_local_setup_socket(struct work_struct *work) dprintk("RPC: xprt %p: socket %s does not exist\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); break; + case -ECONNREFUSED: + dprintk("RPC: xprt %p: connection refused for %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + break; default: printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n", __func__, -status, @@ -2329,9 +2334,11 @@ static void *bc_malloc(struct rpc_task *task, size_t size) struct page *page; struct rpc_buffer *buf; - BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - page = alloc_page(GFP_KERNEL); + 
WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) + return NULL; + page = alloc_page(GFP_KERNEL); if (!page) return NULL; @@ -2393,7 +2400,6 @@ static int bc_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; - struct svc_sock *svsk; u32 len; dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); @@ -2401,7 +2407,6 @@ static int bc_send_request(struct rpc_task *task) * Get the server socket associated with this callback xprt */ xprt = req->rq_xprt->bc_xprt; - svsk = container_of(xprt, struct svc_sock, sk_xprt); /* * Grab the mutex to serialize data as the connection is shared diff --git a/trunk/scripts/Makefile.modsign b/trunk/scripts/Makefile.modsign new file mode 100644 index 000000000000..abfda626dbad --- /dev/null +++ b/trunk/scripts/Makefile.modsign @@ -0,0 +1,32 @@ +# ========================================================================== +# Signing modules +# ========================================================================== + +PHONY := __modsign +__modsign: + +include scripts/Kbuild.include + +__modules := $(sort $(shell grep -h '\.ko' /dev/null $(wildcard $(MODVERDIR)/*.mod))) +modules := $(patsubst %.o,%.ko,$(wildcard $(__modules:.ko=.o))) + +PHONY += $(modules) +__modsign: $(modules) + @: + +quiet_cmd_sign_ko = SIGN [M] $(2)/$(notdir $@) + cmd_sign_ko = $(mod_sign_cmd) $(2)/$(notdir $@) + +# Modules built outside the kernel source tree go into extra by default +INSTALL_MOD_DIR ?= extra +ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D)) + +modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D)) + +$(modules): + $(call cmd,sign_ko,$(MODLIB)/$(modinst_dir)) + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. + +.PHONY: $(PHONY) diff --git a/trunk/scripts/checkpatch.pl b/trunk/scripts/checkpatch.pl index f18750e3bd6c..1d6e4c541370 100755 --- a/trunk/scripts/checkpatch.pl +++ b/trunk/scripts/checkpatch.pl @@ -33,6 +33,7 @@ my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; +my $max_line_length = 80; sub help { my ($exitcode) = @_; @@ -51,6 +52,7 @@ sub help { -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests --ignore TYPE(,TYPE2...) ignore various comma separated message types + --max-line-length=n set the maximum line length, if exceeded, warn --show-types show the message "types" in the output --root=PATH PATH to the kernel tree root --no-summary suppress the per-file summary @@ -107,6 +109,7 @@ sub help { 'strict!' => \$check, 'ignore=s' => \@ignore, 'show-types!' => \$show_types, + 'max-line-length=i' => \$max_line_length, 'root=s' => \$root, 'summary!' => \$summary, 'mailback!' 
=> \$mailback, @@ -227,7 +230,11 @@ sub help { our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; -our $Constant = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)}; +our $Float_hex = qr{(?i:0x[0-9a-f]+p-?[0-9]+[fl]?)}; +our $Float_dec = qr{(?i:((?:[0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(?:e-?[0-9]+)?[fl]?))}; +our $Float_int = qr{(?i:[0-9]+e-?[0-9]+[fl]?)}; +our $Float = qr{$Float_hex|$Float_dec|$Float_int}; +our $Constant = qr{(?:$Float|(?i:(?:0x[0-9a-f]+|[0-9]+)[ul]*))}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ @@ -352,27 +359,6 @@ sub deparenthesize { $chk_signoff = 0 if ($file); -my @dep_includes = (); -my @dep_functions = (); -my $removal = "Documentation/feature-removal-schedule.txt"; -if ($tree && -f "$root/$removal") { - open(my $REMOVE, '<', "$root/$removal") || - die "$P: $removal: open failed - $!\n"; - while (<$REMOVE>) { - if (/^Check:\s+(.*\S)/) { - for my $entry (split(/[, ]+/, $1)) { - if ($entry =~ m@include/(.*)@) { - push(@dep_includes, $1); - - } elsif ($entry !~ m@/@) { - push(@dep_functions, $entry); - } - } - } - } - close($REMOVE); -} - my @rawlines = (); my @lines = (); my $vname; @@ -1412,6 +1398,8 @@ sub process { my %suppress_export; my $suppress_statement = 0; + my %camelcase = (); + # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. # @@ -1757,6 +1745,13 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } +# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig. + if ($realfile =~ /Kconfig/ && + $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) { + WARN("CONFIG_EXPERIMENTAL", + "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); + } + if (($realfile =~ /Makefile.*/ || $realfile =~ /Kbuild.*/) && ($line =~ /\+(EXTRA_[A-Z]+FLAGS).*/)) { my $flag = $1; @@ -1774,15 +1769,15 @@ sub process { # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/); -#80 column limit +#line length limit if ($line =~ /^\+/ && $prevrawline !~ /\/\*\*/ && $rawline !~ /^.\s*\*\s*\@$Ident\s/ && !($line =~ /^\+\s*$logFunctions\s*\(\s*(?:(KERN_\S+\s*|[^"]*))?"[X\t]*"\s*(?:|,|\)\s*;)\s*$/ || $line =~ /^\+\s*"[^"]*"\s*(?:\s*|,|\)\s*;)\s*$/) && - $length > 80) + $length > $max_line_length) { WARN("LONG_LINE", - "line over 80 characters\n" . $herecurr); + "line over $max_line_length characters\n" . $herecurr); } # Check for user-visible strings broken across lines, which breaks the ability @@ -1912,6 +1907,12 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# discourage the addition of CONFIG_EXPERIMENTAL in #if(def). + if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { + WARN("CONFIG_EXPERIMENTAL", + "Use of CONFIG_EXPERIMENTAL is deprecated. 
For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); + } + # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", @@ -2906,12 +2907,17 @@ sub process { } } -#studly caps, commented out until figure out how to distinguish between use of existing and adding new -# if (($line=~/[\w_][a-z\d]+[A-Z]/) and !($line=~/print/)) { -# print "No studly caps, use _\n"; -# print "$herecurr"; -# $clean = 0; -# } +#CamelCase + while ($line =~ m{($Constant|$Lval)}g) { + my $var = $1; + if ($var !~ /$Constant/ && + $var =~ /[A-Z]\w*[a-z]|[a-z]\w*[A-Z]/ && + !defined $camelcase{$var}) { + $camelcase{$var} = 1; + WARN("CAMELCASE", + "Avoid CamelCase: <$var>\n" . $herecurr); + } + } #no spaces allowed after \ in define if ($line=~/\#\s*define.*\\\s$/) { @@ -3013,6 +3019,17 @@ sub process { "Macros with complex values should be enclosed in parenthesis\n" . "$herectx"); } } + +# check for line continuations outside of #defines, preprocessor #, and asm + + } else { + if ($prevline !~ /^..*\\$/ && + $line !~ /^\+\s*\#.*\\$/ && # preprocessor + $line !~ /^\+.*\b(__asm__|asm)\b.*\\$/ && # asm + $line =~ /^\+.*\\$/) { + WARN("LINE_CONTINUATIONS", + "Avoid unnecessary line continuations\n" . $herecurr); + } } # do {} while (0) macro tests: @@ -3183,20 +3200,14 @@ sub process { } } -# don't include deprecated include files (uses RAW line) - for my $inc (@dep_includes) { - if ($rawline =~ m@^.\s*\#\s*include\s*\<$inc>@) { - ERROR("DEPRECATED_INCLUDE", - "Don't use <$inc>: see Documentation/feature-removal-schedule.txt\n" . $herecurr); - } +# check for unnecessary blank lines around braces + if (($line =~ /^..*}\s*$/ && $prevline =~ /^.\s*$/)) { + CHK("BRACES", + "Blank lines aren't necessary before a close brace '}'\n" . $hereprev); } - -# don't use deprecated functions - for my $func (@dep_functions) { - if ($line =~ /\b$func\b/) { - ERROR("DEPRECATED_FUNCTION", - "Don't use $func(): see Documentation/feature-removal-schedule.txt\n" . $herecurr); - } + if (($line =~ /^.\s*$/ && $prevline =~ /^..*{\s*$/)) { + CHK("BRACES", + "Blank lines aren't necessary after an open brace '{'\n" . $hereprev); } # no volatiles please @@ -3213,20 +3224,12 @@ sub process { $herecurr); } -# check for needless kfree() checks - if ($prevline =~ /\bif\s*\(([^\)]*)\)/) { - my $expr = $1; - if ($line =~ /\bkfree\(\Q$expr\E\);/) { - WARN("NEEDLESS_KFREE", - "kfree(NULL) is safe this check is probably not required\n" . $hereprev); - } - } -# check for needless usb_free_urb() checks - if ($prevline =~ /\bif\s*\(([^\)]*)\)/) { - my $expr = $1; - if ($line =~ /\busb_free_urb\(\Q$expr\E\);/) { - WARN("NEEDLESS_USB_FREE_URB", - "usb_free_urb(NULL) is safe this check is probably not required\n" . $hereprev); +# check for needless "if () fn()" uses + if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) { + my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;'; + if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) { + WARN('NEEDLESS_IF', + "$1(NULL) is safe this check is probably not required\n" . $hereprev); } } @@ -3344,6 +3347,12 @@ sub process { "Avoid line continuations in quoted strings\n" . $herecurr); } +# check for struct spinlock declarations + if ($line =~ /^.\s*\bstruct\s+spinlock\s+\w+\s*;/) { + WARN("USE_SPINLOCK_T", + "struct spinlock should be spinlock_t\n" . 
$herecurr); + } + # Check for misused memsets if ($^V && $^V ge 5.10.0 && defined $stat && @@ -3450,8 +3459,22 @@ sub process { # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { - WARN("ONE_SEMICOLON", - "Statements terminations use 1 semicolon\n" . $herecurr); + WARN("ONE_SEMICOLON", + "Statements terminations use 1 semicolon\n" . $herecurr); + } + +# check for switch/default statements without a break; + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { + my $ctx = ''; + my $herectx = $here . "\n"; + my $cnt = statement_rawlines($stat); + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; + } + WARN("DEFAULT_NO_BREAK", + "switch default: should use break\n" . $herectx); } # check for gcc specific __FUNCTION__ diff --git a/trunk/scripts/coccinelle/api/d_find_alias.cocci b/trunk/scripts/coccinelle/api/d_find_alias.cocci new file mode 100644 index 000000000000..a9694a8d3e5a --- /dev/null +++ b/trunk/scripts/coccinelle/api/d_find_alias.cocci @@ -0,0 +1,80 @@ +/// Make sure calls to d_find_alias() have a corresponding call to dput(). +// +// Keywords: d_find_alias, dput +// +// Confidence: Moderate +// URL: http://coccinelle.lip6.fr/ +// Options: -include_headers + +virtual context +virtual org +virtual patch +virtual report + +@r exists@ +local idexpression struct dentry *dent; +expression E, E1; +statement S1, S2; +position p1, p2; +@@ +( + if (!(dent@p1 = d_find_alias(...))) S1 +| + dent@p1 = d_find_alias(...) +) + +<...when != dput(dent) + when != if (...) { <+... dput(dent) ...+> } + when != true !dent || ... + when != dent = E + when != E = dent +if (!dent || ...) S2 +...> +( + return <+...dent...+>; +| + return @p2 ...; +| + dent@p2 = E1; +| + E1 = dent; +) + +@depends on context@ +local idexpression struct dentry *r.dent; +position r.p1,r.p2; +@@ +* dent@p1 = ... + ... +( +* return@p2 ...; +| +* dent@p2 +) + + +@script:python depends on org@ +p1 << r.p1; +p2 << r.p2; +@@ +cocci.print_main("Missing call to dput()",p1) +cocci.print_secs("",p2) + +@depends on patch@ +local idexpression struct dentry *r.dent; +position r.p2; +@@ +( ++ dput(dent); + return @p2 ...; +| ++ dput(dent); + dent@p2 = ...; +) + +@script:python depends on report@ +p1 << r.p1; +p2 << r.p2; +@@ +msg = "Missing call to dput() at line %s." 
+coccilib.report.print_report(p1[0], msg % (p2[0].line)) diff --git a/trunk/security/capability.c b/trunk/security/capability.c index b14a30c234b8..0fe5a026aef8 100644 --- a/trunk/security/capability.c +++ b/trunk/security/capability.c @@ -395,6 +395,11 @@ static int cap_kernel_module_request(char *kmod_name) return 0; } +static int cap_kernel_module_from_file(struct file *file) +{ + return 0; +} + static int cap_task_setpgid(struct task_struct *p, pid_t pgid) { return 0; @@ -967,6 +972,7 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, kernel_module_request); + set_to_cap_if_null(ops, kernel_module_from_file); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setpgid); set_to_cap_if_null(ops, task_getpgid); diff --git a/trunk/security/commoncap.c b/trunk/security/commoncap.c index 6dbae4650abe..7ee08c756d6b 100644 --- a/trunk/security/commoncap.c +++ b/trunk/security/commoncap.c @@ -76,24 +76,33 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, int audit) { - for (;;) { - /* The owner of the user namespace has all caps. */ - if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, cred->euid)) - return 0; + struct user_namespace *ns = targ_ns; + /* See if cred has the capability in the target user namespace + * by examining the target user namespace and all of the target + * user namespace's parents. + */ + for (;;) { /* Do we have the necessary capabilities? */ - if (targ_ns == cred->user_ns) + if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* Have we tried all of the parent namespaces? */ - if (targ_ns == &init_user_ns) + if (ns == &init_user_ns) return -EPERM; + /* + * The owner of the user namespace in the parent of the + * user namespace has all caps. + */ + if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) + return 0; + /* - *If you have a capability in a parent user ns, then you have + * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ - targ_ns = targ_ns->parent; + ns = ns->parent; } /* We never get here */ diff --git a/trunk/security/integrity/ima/ima.h b/trunk/security/integrity/ima/ima.h index 6ee8826662cc..3b2adb794f15 100644 --- a/trunk/security/integrity/ima/ima.h +++ b/trunk/security/integrity/ima/ima.h @@ -127,7 +127,7 @@ struct integrity_iint_cache *integrity_iint_insert(struct inode *inode); struct integrity_iint_cache *integrity_iint_find(struct inode *inode); /* IMA policy related functions */ -enum ima_hooks { FILE_CHECK = 1, FILE_MMAP, BPRM_CHECK, POST_SETATTR }; +enum ima_hooks { FILE_CHECK = 1, FILE_MMAP, BPRM_CHECK, MODULE_CHECK, POST_SETATTR }; int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask, int flags); diff --git a/trunk/security/integrity/ima/ima_api.c b/trunk/security/integrity/ima/ima_api.c index b356884fb3ef..0cea3db21657 100644 --- a/trunk/security/integrity/ima/ima_api.c +++ b/trunk/security/integrity/ima/ima_api.c @@ -100,12 +100,12 @@ void ima_add_violation(struct inode *inode, const unsigned char *filename, * ima_get_action - appraise & measure decision based on policy. 
* @inode: pointer to inode to measure * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXECUTE) - * @function: calling function (FILE_CHECK, BPRM_CHECK, FILE_MMAP) + * @function: calling function (FILE_CHECK, BPRM_CHECK, FILE_MMAP, MODULE_CHECK) * * The policy is defined in terms of keypairs: * subj=, obj=, type=, func=, mask=, fsmagic= * subj,obj, and type: are LSM specific. - * func: FILE_CHECK | BPRM_CHECK | FILE_MMAP + * func: FILE_CHECK | BPRM_CHECK | FILE_MMAP | MODULE_CHECK * mask: contains the permission mask * fsmagic: hex value * diff --git a/trunk/security/integrity/ima/ima_main.c b/trunk/security/integrity/ima/ima_main.c index 73c9a268253e..45de18e9a6f2 100644 --- a/trunk/security/integrity/ima/ima_main.c +++ b/trunk/security/integrity/ima/ima_main.c @@ -280,6 +280,27 @@ int ima_file_check(struct file *file, int mask) } EXPORT_SYMBOL_GPL(ima_file_check); +/** + * ima_module_check - based on policy, collect/store/appraise measurement. + * @file: pointer to the file to be measured/appraised + * + * Measure/appraise kernel modules based on policy. + * + * Always return 0 and audit dentry_open failures. + * Return code is based upon measurement appraisal. + */ +int ima_module_check(struct file *file) +{ + int rc; + + if (!file) + rc = INTEGRITY_UNKNOWN; + else + rc = process_measurement(file, file->f_dentry->d_name.name, + MAY_EXEC, MODULE_CHECK); + return (ima_appraise & IMA_APPRAISE_ENFORCE) ? rc : 0; +} + static int __init init_ima(void) { int error; diff --git a/trunk/security/integrity/ima/ima_policy.c b/trunk/security/integrity/ima/ima_policy.c index c7dacd2eab7a..af7d182d5a46 100644 --- a/trunk/security/integrity/ima/ima_policy.c +++ b/trunk/security/integrity/ima/ima_policy.c @@ -80,6 +80,7 @@ static struct ima_rule_entry default_rules[] = { .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE,.func = FILE_CHECK,.mask = MAY_READ,.uid = GLOBAL_ROOT_UID, .flags = IMA_FUNC | IMA_MASK | IMA_UID}, + {.action = MEASURE,.func = MODULE_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_appraise_rules[] = { @@ -401,6 +402,8 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) /* PATH_CHECK is for backwards compat */ else if (strcmp(args[0].from, "PATH_CHECK") == 0) entry->func = FILE_CHECK; + else if (strcmp(args[0].from, "MODULE_CHECK") == 0) + entry->func = MODULE_CHECK; else if (strcmp(args[0].from, "FILE_MMAP") == 0) entry->func = FILE_MMAP; else if (strcmp(args[0].from, "BPRM_CHECK") == 0) diff --git a/trunk/security/security.c b/trunk/security/security.c index 8dcd4ae10a5f..daa97f4ac9d1 100644 --- a/trunk/security/security.c +++ b/trunk/security/security.c @@ -820,6 +820,16 @@ int security_kernel_module_request(char *kmod_name) return security_ops->kernel_module_request(kmod_name); } +int security_kernel_module_from_file(struct file *file) +{ + int ret; + + ret = security_ops->kernel_module_from_file(file); + if (ret) + return ret; + return ima_module_check(file); +} + int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { diff --git a/trunk/security/selinux/nlmsgtab.c b/trunk/security/selinux/nlmsgtab.c index 370a6468b3ba..855e464e92ef 100644 --- a/trunk/security/selinux/nlmsgtab.c +++ b/trunk/security/selinux/nlmsgtab.c @@ -69,6 +69,8 @@ static struct nlmsg_perm nlmsg_route_perms[] = { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { 
RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETMDB, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; diff --git a/trunk/security/yama/yama_lsm.c b/trunk/security/yama/yama_lsm.c index 2663145d1197..23414b93771f 100644 --- a/trunk/security/yama/yama_lsm.c +++ b/trunk/security/yama/yama_lsm.c @@ -298,14 +298,18 @@ int yama_ptrace_access_check(struct task_struct *child, /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: + rcu_read_lock(); if (!task_is_descendant(current, child) && !ptracer_exception_found(current, child) && - !ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) + !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: - if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: @@ -343,8 +347,10 @@ int yama_ptrace_traceme(struct task_struct *parent) /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: - if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; diff --git a/trunk/sound/Kconfig b/trunk/sound/Kconfig index 261a03c8a209..c710ce2c5c37 100644 --- a/trunk/sound/Kconfig +++ b/trunk/sound/Kconfig @@ -52,9 +52,6 @@ config SOUND_OSS_CORE_PRECLAIM Disabling this allows alternative OSS implementations. - Please read Documentation/feature-removal-schedule.txt for - details. - If unsure, say Y. source "sound/oss/dmasound/Kconfig" diff --git a/trunk/sound/sound_core.c b/trunk/sound/sound_core.c index fb9255cca214..bb23009edc8d 100644 --- a/trunk/sound/sound_core.c +++ b/trunk/sound/sound_core.c @@ -146,8 +146,7 @@ extern int msnd_pinnacle_init(void); * devices only the standard chrdev aliases are requested. * * All these clutters are scheduled to be removed along with - * sound-slot/service-* module aliases. Please take a look at - * feature-removal-schedule.txt for details. + * sound-slot/service-* module aliases. 
*/ #ifdef CONFIG_SOUND_OSS_CORE_PRECLAIM static int preclaim_oss = 1; diff --git a/trunk/tools/power/x86/turbostat/Makefile b/trunk/tools/power/x86/turbostat/Makefile index f85649554191..f09641da40d4 100644 --- a/trunk/tools/power/x86/turbostat/Makefile +++ b/trunk/tools/power/x86/turbostat/Makefile @@ -1,9 +1,22 @@ +CC = $(CROSS_COMPILE)gcc +BUILD_OUTPUT := $(PWD) +PREFIX := /usr +DESTDIR := + turbostat : turbostat.c CFLAGS += -Wall +CFLAGS += -I../../../../arch/x86/include/uapi/ + +%: %.c + @mkdir -p $(BUILD_OUTPUT) + $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ +.PHONY : clean clean : - rm -f turbostat + @rm -f $(BUILD_OUTPUT)/turbostat -install : - install turbostat /usr/bin/turbostat - install turbostat.8 /usr/share/man/man8 +install : turbostat + install -d $(DESTDIR)$(PREFIX)/bin + install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat + install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 diff --git a/trunk/tools/power/x86/turbostat/turbostat.8 b/trunk/tools/power/x86/turbostat/turbostat.8 index e4d0690cccf9..0d7dc2cfefb5 100644 --- a/trunk/tools/power/x86/turbostat/turbostat.8 +++ b/trunk/tools/power/x86/turbostat/turbostat.8 @@ -11,16 +11,16 @@ turbostat \- Report processor frequency and idle statistics .RB [ Options ] .RB [ "\-i interval_sec" ] .SH DESCRIPTION -\fBturbostat \fP reports processor topology, frequency -and idle power state statistics on modern X86 processors. +\fBturbostat \fP reports processor topology, frequency, +idle power-state statistics, temperature and power on modern X86 processors. Either \fBcommand\fP is forked and statistics are printed upon its completion, or statistics are printed periodically. \fBturbostat \fP -requires that the processor +must be run on root, and +minimally requires that the processor supports an "invariant" TSC, plus the APERF and MPERF MSRs. -\fBturbostat \fP will report idle cpu power state residency -on processors that additionally support C-state residency counters. +Additional information is reported depending on hardware counter support. .SS Options The \fB-p\fP option limits output to the 1st thread in 1st core of each package. @@ -57,7 +57,15 @@ Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading T \fBGHz\fP average clock rate while the CPU was in c0 state. \fBTSC\fP average GHz that the TSC ran during the entire interval. \fB%c1, %c3, %c6, %c7\fP show the percentage residency in hardware core idle states. +\fBCTMP\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. +\fBPTMP\fP Degrees Celsius reported by the per-package Package Thermal Monitor. \fB%pc2, %pc3, %pc6, %pc7\fP percentage residency in hardware package idle states. +\fBPkg_W\fP Watts consumed by the whole package. +\fBCor_W\fP Watts consumed by the core part of the package. +\fBGFX_W\fP Watts consumed by the Graphics part of the package -- available only on client processors. +\fBRAM_W\fP Watts consumed by the DRAM DIMMS -- available only on server processors. +\fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. +\fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .fi .PP .SH EXAMPLE @@ -66,50 +74,73 @@ Without any parameters, turbostat prints out counters ever 5 seconds. for turbostat to fork). The first row of statistics is a summary for the entire system. -Note that the summary is a weighted average. +For residency % columns, the summary is a weighted average. 
+For Temperature columns, the summary is the column maximum. +For Watts columns, the summary is a system total. Subsequent rows show per-CPU statistics. .nf -[root@x980]# ./turbostat -cor CPU %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 - 0.09 1.62 3.38 1.83 0.32 97.76 1.26 83.61 - 0 0 0.15 1.62 3.38 10.23 0.05 89.56 1.26 83.61 - 0 6 0.05 1.62 3.38 10.34 - 1 2 0.03 1.62 3.38 0.07 0.05 99.86 - 1 8 0.03 1.62 3.38 0.06 - 2 4 0.21 1.62 3.38 0.10 1.49 98.21 - 2 10 0.02 1.62 3.38 0.29 - 8 1 0.04 1.62 3.38 0.04 0.08 99.84 - 8 7 0.01 1.62 3.38 0.06 - 9 3 0.53 1.62 3.38 0.10 0.20 99.17 - 9 9 0.02 1.62 3.38 0.60 - 10 5 0.01 1.62 3.38 0.02 0.04 99.92 - 10 11 0.02 1.62 3.38 0.02 +[root@sandy]# ./turbostat +cor CPU %c0 GHz TSC %c1 %c3 %c6 %c7 CTMP PTMP %pc2 %pc3 %pc6 %pc7 Pkg_W Cor_W GFX_W + 0.06 0.80 2.29 0.11 0.00 0.00 99.83 47 40 0.26 0.01 0.44 98.78 3.49 0.12 0.14 + 0 0 0.07 0.80 2.29 0.07 0.00 0.00 99.86 40 40 0.26 0.01 0.44 98.78 3.49 0.12 0.14 + 0 4 0.03 0.80 2.29 0.12 + 1 1 0.04 0.80 2.29 0.25 0.01 0.00 99.71 40 + 1 5 0.16 0.80 2.29 0.13 + 2 2 0.05 0.80 2.29 0.06 0.01 0.00 99.88 40 + 2 6 0.03 0.80 2.29 0.08 + 3 3 0.05 0.80 2.29 0.08 0.00 0.00 99.87 47 + 3 7 0.04 0.84 2.29 0.09 .fi .SH SUMMARY EXAMPLE The "-s" option prints the column headers just once, and then the one line system summary for each sample interval. .nf -[root@x980]# ./turbostat -s - %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 - 0.23 1.67 3.38 2.00 0.30 97.47 1.07 82.12 - 0.10 1.62 3.38 1.87 2.25 95.77 12.02 72.60 - 0.20 1.64 3.38 1.98 0.11 97.72 0.30 83.36 - 0.11 1.70 3.38 1.86 1.81 96.22 9.71 74.90 +[root@wsm]# turbostat -S + %c0 GHz TSC %c1 %c3 %c6 CTMP %pc3 %pc6 + 1.40 2.81 3.38 10.78 43.47 44.35 42 13.67 2.09 + 1.34 2.90 3.38 11.48 58.96 28.23 41 19.89 0.15 + 1.55 2.72 3.38 26.73 37.66 34.07 42 2.53 2.80 + 1.37 2.83 3.38 16.95 60.05 21.63 42 5.76 0.20 .fi .SH VERBOSE EXAMPLE The "-v" option adds verbosity to the output: .nf -GenuineIntel 11 CPUID levels; family:model:stepping 0x6:2c:2 (6:44:2) -12 * 133 = 1600 MHz max efficiency -25 * 133 = 3333 MHz TSC frequency -26 * 133 = 3467 MHz max turbo 4 active cores -26 * 133 = 3467 MHz max turbo 3 active cores -27 * 133 = 3600 MHz max turbo 2 active cores -27 * 133 = 3600 MHz max turbo 1 active cores - +[root@ivy]# turbostat -v +turbostat v3.0 November 23, 2012 - Len Brown +CPUID(0): GenuineIntel 13 CPUID levels; family:model:stepping 0x6:3a:9 (6:58:9) +CPUID(6): APERF, DTS, PTM, EPB +RAPL: 851 sec. Joule Counter Range +cpu0: MSR_NHM_PLATFORM_INFO: 0x81010f0012300 +16 * 100 = 1600 MHz max efficiency +35 * 100 = 3500 MHz TSC frequency +cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x1e008402 (UNdemote-C3, UNdemote-C1, demote-C3, demote-C1, locked: pkg-cstate-limit=2: pc6-noret) +cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x25262727 +37 * 100 = 3700 MHz max turbo 4 active cores +38 * 100 = 3800 MHz max turbo 3 active cores +39 * 100 = 3900 MHz max turbo 2 active cores +39 * 100 = 3900 MHz max turbo 1 active cores +cpu0: MSR_IA32_ENERGY_PERF_BIAS: 0x00000006 (balanced) +cpu0: MSR_RAPL_POWER_UNIT: 0x000a1003 (0.125000 Watts, 0.000015 Joules, 0.000977 sec.) +cpu0: MSR_PKG_POWER_INFO: 0x01e00268 (77 W TDP, RAPL 60 - 0 W, 0.000000 sec.) 
+cpu0: MSR_PKG_POWER_LIMIT: 0x830000148268 (UNlocked) +cpu0: PKG Limit #1: ENabled (77.000000 Watts, 1.000000 sec, clamp DISabled) +cpu0: PKG Limit #2: ENabled (96.000000 Watts, 0.000977* sec, clamp DISabled) +cpu0: MSR_PP0_POLICY: 0 +cpu0: MSR_PP0_POWER_LIMIT: 0x00000000 (UNlocked) +cpu0: Cores Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) +cpu0: MSR_PP1_POLICY: 0 +cpu0: MSR_PP1_POWER_LIMIT: 0x00000000 (UNlocked) +cpu0: GFX Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) +cpu0: MSR_IA32_TEMPERATURE_TARGET: 0x00691400 (105 C) +cpu0: MSR_IA32_PACKAGE_THERM_STATUS: 0x884e0000 (27 C) +cpu0: MSR_IA32_THERM_STATUS: 0x88560000 (19 C +/- 1) +cpu1: MSR_IA32_THERM_STATUS: 0x88560000 (19 C +/- 1) +cpu2: MSR_IA32_THERM_STATUS: 0x88540000 (21 C +/- 1) +cpu3: MSR_IA32_THERM_STATUS: 0x884e0000 (27 C +/- 1) + ... .fi The \fBmax efficiency\fP frequency, a.k.a. Low Frequency Mode, is the frequency available at the minimum package voltage. The \fBTSC frequency\fP is the nominal @@ -142,7 +173,7 @@ cor CPU %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 10 5 1.42 3.43 3.38 2.14 30.99 65.44 10 11 0.16 2.88 3.38 3.40 .fi -Above the cycle soaker drives cpu7 up its 3.6 Ghz turbo limit +Above the cycle soaker drives cpu7 up its 3.6 GHz turbo limit while the other processors are generally in various states of idle. Note that cpu1 and cpu7 are HT siblings within core8. diff --git a/trunk/tools/power/x86/turbostat/turbostat.c b/trunk/tools/power/x86/turbostat/turbostat.c index ea095abbe97e..ce6d46038f74 100644 --- a/trunk/tools/power/x86/turbostat/turbostat.c +++ b/trunk/tools/power/x86/turbostat/turbostat.c @@ -20,6 +20,7 @@ */ #define _GNU_SOURCE +#include #include #include #include @@ -35,28 +36,18 @@ #include #include -#define MSR_NEHALEM_PLATFORM_INFO 0xCE -#define MSR_NEHALEM_TURBO_RATIO_LIMIT 0x1AD -#define MSR_IVT_TURBO_RATIO_LIMIT 0x1AE -#define MSR_APERF 0xE8 -#define MSR_MPERF 0xE7 -#define MSR_PKG_C2_RESIDENCY 0x60D /* SNB only */ -#define MSR_PKG_C3_RESIDENCY 0x3F8 -#define MSR_PKG_C6_RESIDENCY 0x3F9 -#define MSR_PKG_C7_RESIDENCY 0x3FA /* SNB only */ -#define MSR_CORE_C3_RESIDENCY 0x3FC -#define MSR_CORE_C6_RESIDENCY 0x3FD -#define MSR_CORE_C7_RESIDENCY 0x3FE /* SNB only */ - char *proc_stat = "/proc/stat"; unsigned int interval_sec = 5; /* set with -i interval_sec */ unsigned int verbose; /* set with -v */ +unsigned int rapl_verbose; /* set with -R */ +unsigned int thermal_verbose; /* set with -T */ unsigned int summary_only; /* set with -s */ unsigned int skip_c0; unsigned int skip_c1; unsigned int do_nhm_cstates; unsigned int do_snb_cstates; unsigned int has_aperf; +unsigned int has_epb; unsigned int units = 1000000000; /* Ghz etc */ unsigned int genuine_intel; unsigned int has_invariant_tsc; @@ -74,6 +65,23 @@ unsigned int show_cpu; unsigned int show_pkg_only; unsigned int show_core_only; char *output_buffer, *outp; +unsigned int do_rapl; +unsigned int do_dts; +unsigned int do_ptm; +unsigned int tcc_activation_temp; +unsigned int tcc_activation_temp_override; +double rapl_power_units, rapl_energy_units, rapl_time_units; +double rapl_joule_counter_range; + +#define RAPL_PKG (1 << 0) +#define RAPL_CORES (1 << 1) +#define RAPL_GFX (1 << 2) +#define RAPL_DRAM (1 << 3) +#define RAPL_PKG_PERF_STATUS (1 << 4) +#define RAPL_DRAM_PERF_STATUS (1 << 5) +#define TJMAX_DEFAULT 100 + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) int aperf_mperf_unstable; int backwards_count; @@ -101,6 +109,7 @@ struct core_data { unsigned long long c3; unsigned long long c6; unsigned long long c7; + unsigned int core_temp_c; unsigned int core_id; } *core_even, *core_odd; @@ -110,6 +119,14 @@ struct pkg_data { unsigned long long pc6; unsigned long long pc7; unsigned int package_id; + unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + unsigned int energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + unsigned int energy_cores; /* MSR_PP0_ENERGY_STATUS */ + unsigned int energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + unsigned int rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + unsigned int rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + unsigned int pkg_temp_c; + } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -247,6 +264,12 @@ void print_header(void) outp += sprintf(outp, " %%c6"); if (do_snb_cstates) outp += sprintf(outp, " %%c7"); + + if (do_dts) + outp += sprintf(outp, " CTMP"); + if (do_ptm) + outp += sprintf(outp, " PTMP"); + if (do_snb_cstates) outp += sprintf(outp, " %%pc2"); if (do_nhm_cstates) @@ -256,6 +279,19 @@ void print_header(void) if (do_snb_cstates) outp += sprintf(outp, " %%pc7"); + if (do_rapl & RAPL_PKG) + outp += sprintf(outp, " Pkg_W"); + if (do_rapl & RAPL_CORES) + outp += sprintf(outp, " Cor_W"); + if (do_rapl & RAPL_GFX) + outp += sprintf(outp, " GFX_W"); + if (do_rapl & RAPL_DRAM) + outp += sprintf(outp, " RAM_W"); + if (do_rapl & RAPL_PKG_PERF_STATUS) + outp += sprintf(outp, " PKG_%%"); + if (do_rapl & RAPL_DRAM_PERF_STATUS) + outp += sprintf(outp, " RAM_%%"); + outp += sprintf(outp, "\n"); } @@ -285,6 +321,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, fprintf(stderr, "c3: %016llX\n", c->c3); fprintf(stderr, "c6: %016llX\n", c->c6); fprintf(stderr, "c7: %016llX\n", c->c7); + fprintf(stderr, "DTS: %dC\n", c->core_temp_c); } if (p) { @@ -293,6 +330,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, fprintf(stderr, "pc3: %016llX\n", p->pc3); fprintf(stderr, "pc6: %016llX\n", p->pc6); fprintf(stderr, "pc7: %016llX\n", p->pc7); + fprintf(stderr, "Joules PKG: %0X\n", p->energy_pkg); + fprintf(stderr, "Joules COR: %0X\n", p->energy_cores); + fprintf(stderr, "Joules GFX: %0X\n", p->energy_gfx); + fprintf(stderr, "Joules RAM: %0X\n", p->energy_dram); + fprintf(stderr, "Throttle PKG: %0X\n", p->rapl_pkg_perf_status); + fprintf(stderr, "Throttle RAM: %0X\n", p->rapl_dram_perf_status); + fprintf(stderr, "PTM: %dC\n", p->pkg_temp_c); } return 0; } @@ -302,14 +346,21 @@ int dump_counters(struct thread_data *t, struct core_data *c, * package: "pk" 2 columns %2d * core: "cor" 3 columns %3d * CPU: "CPU" 3 columns %3d + * Pkg_W: %6.2 + * Cor_W: %6.2 + * GFX_W: %5.2 + * RAM_W: %5.2 * GHz: "GHz" 3 columns %3.2 * TSC: "TSC" 3 columns %3.2 * percentage " %pc3" %6.2 + * Perf Status percentage: %5.2 + * "CTMP" 4 columns %4d */ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { double interval_float; + char *fmt5, *fmt6; /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) @@ -349,7 +400,6 @@ int format_counters(struct thread_data *t, struct core_data *c, if (show_cpu) outp += sprintf(outp, " %3d", t->cpu_id); } - /* %c0 */ if (do_nhm_cstates) { if (show_pkg || show_core || show_cpu) @@ -414,10 +464,16 @@ int format_counters(struct thread_data *t, struct core_data *c, if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * 
c->c7/t->tsc); + if (do_dts) + outp += sprintf(outp, " %4d", c->core_temp_c); + /* print per-package data only for 1st core in package */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) goto done; + if (do_ptm) + outp += sprintf(outp, " %4d", p->pkg_temp_c); + if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc2/t->tsc); if (do_nhm_cstates) @@ -426,6 +482,32 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, " %6.2f", 100.0 * p->pc6/t->tsc); if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc7/t->tsc); + + /* + * If measurement interval exceeds minimum RAPL Joule Counter range, + * indicate that results are suspect by printing "**" in fraction place. + */ + if (interval_float < rapl_joule_counter_range) { + fmt5 = " %5.2f"; + fmt6 = " %6.2f"; + } else { + fmt5 = " %3.0f**"; + fmt6 = " %4.0f**"; + } + + if (do_rapl & RAPL_PKG) + outp += sprintf(outp, fmt6, p->energy_pkg * rapl_energy_units / interval_float); + if (do_rapl & RAPL_CORES) + outp += sprintf(outp, fmt6, p->energy_cores * rapl_energy_units / interval_float); + if (do_rapl & RAPL_GFX) + outp += sprintf(outp, fmt5, p->energy_gfx * rapl_energy_units / interval_float); + if (do_rapl & RAPL_DRAM) + outp += sprintf(outp, fmt5, p->energy_dram * rapl_energy_units / interval_float); + if (do_rapl & RAPL_PKG_PERF_STATUS ) + outp += sprintf(outp, fmt5, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + if (do_rapl & RAPL_DRAM_PERF_STATUS ) + outp += sprintf(outp, fmt5, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + done: outp += sprintf(outp, "\n"); @@ -435,6 +517,7 @@ int format_counters(struct thread_data *t, struct core_data *c, void flush_stdout() { fputs(output_buffer, stdout); + fflush(stdout); outp = output_buffer; } void flush_stderr() @@ -461,6 +544,13 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_ for_all_cpus(format_counters, t, c, p); } +#define DELTA_WRAP32(new, old) \ + if (new > old) { \ + old = new - old; \ + } else { \ + old = 0x100000000 + new - old; \ + } + void delta_package(struct pkg_data *new, struct pkg_data *old) { @@ -468,6 +558,14 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pc3 = new->pc3 - old->pc3; old->pc6 = new->pc6 - old->pc6; old->pc7 = new->pc7 - old->pc7; + old->pkg_temp_c = new->pkg_temp_c; + + DELTA_WRAP32(new->energy_pkg, old->energy_pkg); + DELTA_WRAP32(new->energy_cores, old->energy_cores); + DELTA_WRAP32(new->energy_gfx, old->energy_gfx); + DELTA_WRAP32(new->energy_dram, old->energy_dram); + DELTA_WRAP32(new->rapl_pkg_perf_status, old->rapl_pkg_perf_status); + DELTA_WRAP32(new->rapl_dram_perf_status, old->rapl_dram_perf_status); } void @@ -476,6 +574,7 @@ delta_core(struct core_data *new, struct core_data *old) old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; old->c7 = new->c7 - old->c7; + old->core_temp_c = new->core_temp_c; } /* @@ -582,11 +681,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c3 = 0; c->c6 = 0; c->c7 = 0; + c->core_temp_c = 0; p->pc2 = 0; p->pc3 = 0; p->pc6 = 0; p->pc7 = 0; + + p->energy_pkg = 0; + p->energy_dram = 0; + p->energy_cores = 0; + p->energy_gfx = 0; + p->rapl_pkg_perf_status = 0; + p->rapl_dram_perf_status = 0; + p->pkg_temp_c = 0; } int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -607,6 +715,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.cores.c6 += c->c6; average.cores.c7 += c->c7; + 
average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); + /* sum per-pkg values only for 1st core in pkg */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; @@ -616,6 +726,15 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.packages.pc6 += p->pc6; average.packages.pc7 += p->pc7; + average.packages.energy_pkg += p->energy_pkg; + average.packages.energy_dram += p->energy_dram; + average.packages.energy_cores += p->energy_cores; + average.packages.energy_gfx += p->energy_gfx; + + average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); + + average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; + average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; return 0; } /* @@ -667,23 +786,26 @@ static unsigned long long rdtsc(void) int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; + unsigned long long msr; - if (cpu_migrate(cpu)) + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); return -1; + } t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (has_aperf) { - if (get_msr(cpu, MSR_APERF, &t->aperf)) + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) return -3; - if (get_msr(cpu, MSR_MPERF, &t->mperf)) + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) return -4; } if (extra_delta_offset32) { - if (get_msr(cpu, extra_delta_offset32, &t->extra_delta32)) + if (get_msr(cpu, extra_delta_offset32, &msr)) return -5; - t->extra_delta32 &= 0xFFFFFFFF; + t->extra_delta32 = msr & 0xFFFFFFFF; } if (extra_delta_offset64) @@ -691,9 +813,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; if (extra_msr_offset32) { - if (get_msr(cpu, extra_msr_offset32, &t->extra_msr32)) + if (get_msr(cpu, extra_msr_offset32, &msr)) return -5; - t->extra_msr32 &= 0xFFFFFFFF; + t->extra_msr32 = msr & 0xFFFFFFFF; } if (extra_msr_offset64) @@ -715,6 +837,13 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) return -8; + if (do_dts) { + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return -9; + c->core_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + } + + /* collect package counters only for 1st core in package */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; @@ -731,6 +860,41 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) return -12; } + if (do_rapl & RAPL_PKG) { + if (get_msr(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + return -13; + p->energy_pkg = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_CORES) { + if (get_msr(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + return -14; + p->energy_cores = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_DRAM) { + if (get_msr(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + return -15; + p->energy_dram = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_GFX) { + if (get_msr(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + return -16; + p->energy_gfx = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_PKG_PERF_STATUS) { + if (get_msr(cpu, MSR_PKG_PERF_STATUS, &msr)) + return -16; + p->rapl_pkg_perf_status = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_DRAM_PERF_STATUS) { + if (get_msr(cpu, MSR_DRAM_PERF_STATUS, &msr)) + return -16; + p->rapl_dram_perf_status = msr & 0xFFFFFFFF; + } + if (do_ptm) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return -17; + p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + } return 0; } 
@@ -742,10 +906,10 @@ void print_verbose_header(void) if (!do_nehalem_platform_info) return; - get_msr(0, MSR_NEHALEM_PLATFORM_INFO, &msr); + get_msr(0, MSR_NHM_PLATFORM_INFO, &msr); - if (verbose > 1) - fprintf(stderr, "MSR_NEHALEM_PLATFORM_INFO: 0x%llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr); ratio = (msr >> 40) & 0xFF; fprintf(stderr, "%d * %.0f = %.0f MHz max efficiency\n", @@ -760,8 +924,8 @@ void print_verbose_header(void) get_msr(0, MSR_IVT_TURBO_RATIO_LIMIT, &msr); - if (verbose > 1) - fprintf(stderr, "MSR_IVT_TURBO_RATIO_LIMIT: 0x%llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -804,14 +968,56 @@ void print_verbose_header(void) ratio, bclk, ratio * bclk); print_nhm_turbo_ratio_limits: + get_msr(0, MSR_NHM_SNB_PKG_CST_CFG_CTL, &msr); + +#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + + fprintf(stderr, "cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x%08llx", msr); + + fprintf(stderr, " (%s%s%s%s%slocked: pkg-cstate-limit=%d: ", + (msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "", + (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "", + (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "", + (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "", + (msr & (1 << 15)) ? "" : "UN", + (unsigned int)msr & 7); + + + switch(msr & 0x7) { + case 0: + fprintf(stderr, "pc0"); + break; + case 1: + fprintf(stderr, do_snb_cstates ? "pc2" : "pc0"); + break; + case 2: + fprintf(stderr, do_snb_cstates ? "pc6-noret" : "pc3"); + break; + case 3: + fprintf(stderr, "pc6"); + break; + case 4: + fprintf(stderr, "pc7"); + break; + case 5: + fprintf(stderr, do_snb_cstates ? "pc7s" : "invalid"); + break; + case 7: + fprintf(stderr, "unlimited"); + break; + default: + fprintf(stderr, "invalid"); + } + fprintf(stderr, ")\n"); if (!do_nehalem_turbo_ratio_limit) return; - get_msr(0, MSR_NEHALEM_TURBO_RATIO_LIMIT, &msr); + get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - if (verbose > 1) - fprintf(stderr, "MSR_NEHALEM_TURBO_RATIO_LIMIT: 0x%llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1100,15 +1306,22 @@ int mark_cpu_present(int cpu) void turbostat_loop() { int retval; + int restarted = 0; restart: + restarted++; + retval = for_all_cpus(get_counters, EVEN_COUNTERS); if (retval < -1) { exit(retval); } else if (retval == -1) { + if (restarted > 1) { + exit(retval); + } re_initialize(); goto restart; } + restarted = 0; gettimeofday(&tv_even, (struct timezone *)NULL); while (1) { @@ -1207,6 +1420,299 @@ int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) } } +/* + * print_epb() + * Decode the ENERGY_PERF_BIAS MSR + */ +int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + char *epb_string; + int cpu; + + if (!has_epb) + return 0; + + cpu = t->cpu_id; + + /* EPB is per-package */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr)) + return 0; + + switch (msr & 0x7) { + case ENERGY_PERF_BIAS_PERFORMANCE: + epb_string = "performance"; + break; + case ENERGY_PERF_BIAS_NORMAL: + epb_string = "balanced"; + break; + case ENERGY_PERF_BIAS_POWERSAVE: + epb_string = "powersave"; + break; + 
default: + epb_string = "custom"; + break; + } + fprintf(stderr, "cpu%d: MSR_IA32_ENERGY_PERF_BIAS: 0x%08llx (%s)\n", cpu, msr, epb_string); + + return 0; +} + +#define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ +#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ + +/* + * rapl_probe() + * + * sets do_rapl + */ +void rapl_probe(unsigned int family, unsigned int model) +{ + unsigned long long msr; + double tdp; + + if (!genuine_intel) + return; + + if (family != 6) + return; + + switch (model) { + case 0x2A: + case 0x3A: + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX; + break; + case 0x2D: + case 0x3E: + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS; + break; + default: + return; + } + + /* units on package 0, verify later other packages match */ + if (get_msr(0, MSR_RAPL_POWER_UNIT, &msr)) + return; + + rapl_power_units = 1.0 / (1 << (msr & 0xF)); + rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); + rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF)); + + /* get TDP to determine energy counter range */ + if (get_msr(0, MSR_PKG_POWER_INFO, &msr)) + return; + + tdp = ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; + + rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; + + if (verbose) + fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range\n", rapl_joule_counter_range); + + return; +} + +int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int dts; + int cpu; + + if (!(do_dts || do_ptm)) + return 0; + + cpu = t->cpu_id; + + /* DTS is per-core, no need to print for each thread */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + fprintf(stderr, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", + cpu, msr, tcc_activation_temp - dts); + +#ifdef THERM_DEBUG + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(stderr, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); +#endif + } + + + if (do_dts) { + unsigned int resolution; + + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + resolution = (msr >> 27) & 0xF; + fprintf(stderr, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", + cpu, msr, tcc_activation_temp - dts, resolution); + +#ifdef THERM_DEBUG + if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(stderr, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); +#endif + } + + return 0; +} + +void print_power_limit_msr(int cpu, unsigned long long msr, char *label) +{ + fprintf(stderr, "cpu%d: %s: %sabled (%f Watts, %f sec, clamp %sabled)\n", + cpu, label, + ((msr >> 15) & 1) ? "EN" : "DIS", + ((msr >> 0) & 0x7FFF) * rapl_power_units, + (1.0 + (((msr >> 22) & 0x3)/4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, + (((msr >> 16) & 1) ? 
"EN" : "DIS")); + + return; +} + +int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + int cpu; + double local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units; + + if (!do_rapl) + return 0; + + /* RAPL counters are per package, so print only for 1st thread/package */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + cpu = t->cpu_id; + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr)) + return -1; + + local_rapl_power_units = 1.0 / (1 << (msr & 0xF)); + local_rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); + local_rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF)); + + if (local_rapl_power_units != rapl_power_units) + fprintf(stderr, "cpu%d, ERROR: Power units mis-match\n", cpu); + if (local_rapl_energy_units != rapl_energy_units) + fprintf(stderr, "cpu%d, ERROR: Energy units mis-match\n", cpu); + if (local_rapl_time_units != rapl_time_units) + fprintf(stderr, "cpu%d, ERROR: Time units mis-match\n", cpu); + + if (verbose) { + fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx " + "(%f Watts, %f Joules, %f sec.)\n", cpu, msr, + local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units); + } + if (do_rapl & RAPL_PKG) { + if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) + return -5; + + + fprintf(stderr, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", + cpu, msr, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + + if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) + return -9; + + fprintf(stderr, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 63) & 1 ? "": "UN"); + + print_power_limit_msr(cpu, msr, "PKG Limit #1"); + fprintf(stderr, "cpu%d: PKG Limit #2: %sabled (%f Watts, %f* sec, clamp %sabled)\n", + cpu, + ((msr >> 47) & 1) ? "EN" : "DIS", + ((msr >> 32) & 0x7FFF) * rapl_power_units, + (1.0 + (((msr >> 54) & 0x3)/4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, + ((msr >> 48) & 1) ? "EN" : "DIS"); + } + + if (do_rapl & RAPL_DRAM) { + if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) + return -6; + + + fprintf(stderr, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", + cpu, msr, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + + + if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) + return -9; + fprintf(stderr, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 31) & 1 ? "": "UN"); + + print_power_limit_msr(cpu, msr, "DRAM Limit"); + } + if (do_rapl & RAPL_CORES) { + if (verbose) { + if (get_msr(cpu, MSR_PP0_POLICY, &msr)) + return -7; + + fprintf(stderr, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); + + if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) + return -9; + fprintf(stderr, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 31) & 1 ? 
"": "UN"); + print_power_limit_msr(cpu, msr, "Cores Limit"); + } + } + if (do_rapl & RAPL_GFX) { + if (verbose) { + if (get_msr(cpu, MSR_PP1_POLICY, &msr)) + return -8; + + fprintf(stderr, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF); + + if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr)) + return -9; + fprintf(stderr, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 31) & 1 ? "": "UN"); + print_power_limit_msr(cpu, msr, "GFX Limit"); + } + } + return 0; +} + int is_snb(unsigned int family, unsigned int model) { @@ -1231,6 +1737,72 @@ double discover_bclk(unsigned int family, unsigned int model) return 133.33; } +/* + * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where + * the Thermal Control Circuit (TCC) activates. + * This is usually equal to tjMax. + * + * Older processors do not have this MSR, so there we guess, + * but also allow cmdline over-ride with -T. + * + * Several MSR temperature values are in units of degrees-C + * below this value, including the Digital Thermal Sensor (DTS), + * Package Thermal Management Sensor (PTM), and thermal event thresholds. + */ +int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int target_c_local; + int cpu; + + /* tcc_activation_temp is used only for dts or ptm */ + if (!(do_dts || do_ptm)) + return 0; + + /* this is a per-package concept */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + cpu = t->cpu_id; + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (tcc_activation_temp_override != 0) { + tcc_activation_temp = tcc_activation_temp_override; + fprintf(stderr, "cpu%d: Using cmdline TCC Target (%d C)\n", + cpu, tcc_activation_temp); + return 0; + } + + /* Temperature Target MSR is Nehalem and newer only */ + if (!do_nehalem_platform_info) + goto guess; + + if (get_msr(0, MSR_IA32_TEMPERATURE_TARGET, &msr)) + goto guess; + + target_c_local = (msr >> 16) & 0x7F; + + if (verbose) + fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + cpu, msr, target_c_local); + + if (target_c_local < 85 || target_c_local > 120) + goto guess; + + tcc_activation_temp = target_c_local; + + return 0; + +guess: + tcc_activation_temp = TJMAX_DEFAULT; + fprintf(stderr, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", + cpu, tcc_activation_temp); + + return 0; +} void check_cpuid() { unsigned int eax, ebx, ecx, edx, max_level; @@ -1244,7 +1816,7 @@ void check_cpuid() genuine_intel = 1; if (verbose) - fprintf(stderr, "%.4s%.4s%.4s ", + fprintf(stderr, "CPUID(0): %.4s%.4s%.4s ", (char *)&ebx, (char *)&edx, (char *)&ecx); asm("cpuid" : "=a" (fms), "=c" (ecx), "=d" (edx) : "a" (1) : "ebx"); @@ -1295,10 +1867,19 @@ void check_cpuid() asm("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (0x6)); has_aperf = ecx & (1 << 0); - if (!has_aperf) { - fprintf(stderr, "No APERF MSR\n"); - exit(1); - } + do_dts = eax & (1 << 0); + do_ptm = eax & (1 << 6); + has_epb = ecx & (1 << 3); + + if (verbose) + fprintf(stderr, "CPUID(6): %s%s%s%s\n", + has_aperf ? "APERF" : "No APERF!", + do_dts ? ", DTS" : "", + do_ptm ? ", PTM": "", + has_epb ? 
", EPB": ""); + + if (!has_aperf) + exit(-1); do_nehalem_platform_info = genuine_intel && has_invariant_tsc; do_nhm_cstates = genuine_intel; /* all Intel w/ non-stop TSC have NHM counters */ @@ -1307,12 +1888,15 @@ void check_cpuid() do_nehalem_turbo_ratio_limit = has_nehalem_turbo_ratio_limit(family, model); do_ivt_turbo_ratio_limit = has_ivt_turbo_ratio_limit(family, model); + rapl_probe(family, model); + + return; } void usage() { - fprintf(stderr, "%s: [-v][-p|-P|-S][-c MSR# | -s]][-C MSR#][-m MSR#][-M MSR#][-i interval_sec | command ...]\n", + fprintf(stderr, "%s: [-v][-R][-T][-p|-P|-S][-c MSR# | -s]][-C MSR#][-m MSR#][-M MSR#][-i interval_sec | command ...]\n", progname); exit(1); } @@ -1548,6 +2132,17 @@ void turbostat_init() if (verbose) print_verbose_header(); + + if (verbose) + for_all_cpus(print_epb, ODD_COUNTERS); + + if (verbose) + for_all_cpus(print_rapl, ODD_COUNTERS); + + for_all_cpus(set_temperature_target, ODD_COUNTERS); + + if (verbose) + for_all_cpus(print_thermal, ODD_COUNTERS); } int fork_it(char **argv) @@ -1604,7 +2199,7 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt(argc, argv, "+pPSvi:sc:sC:m:M:")) != -1) { + while ((opt = getopt(argc, argv, "+pPSvi:sc:sC:m:M:RT:")) != -1) { switch (opt) { case 'p': show_core_only++; @@ -1636,6 +2231,12 @@ void cmdline(int argc, char **argv) case 'M': sscanf(optarg, "%x", &extra_msr_offset64); break; + case 'R': + rapl_verbose++; + break; + case 'T': + tcc_activation_temp_override = atoi(optarg); + break; default: usage(); } @@ -1646,8 +2247,8 @@ int main(int argc, char **argv) { cmdline(argc, argv); - if (verbose > 1) - fprintf(stderr, "turbostat v2.1 October 6, 2012" + if (verbose) + fprintf(stderr, "turbostat v3.0 November 23, 2012" " - Len Brown \n"); turbostat_init(); diff --git a/trunk/tools/power/x86/x86_energy_perf_policy/Makefile b/trunk/tools/power/x86/x86_energy_perf_policy/Makefile index f458237fdd79..971c9ffdcb50 100644 --- a/trunk/tools/power/x86/x86_energy_perf_policy/Makefile +++ b/trunk/tools/power/x86/x86_energy_perf_policy/Makefile @@ -1,8 +1,10 @@ +DESTDIR ?= + x86_energy_perf_policy : x86_energy_perf_policy.c clean : rm -f x86_energy_perf_policy install : - install x86_energy_perf_policy /usr/bin/ - install x86_energy_perf_policy.8 /usr/share/man/man8/ + install x86_energy_perf_policy ${DESTDIR}/usr/bin/ + install x86_energy_perf_policy.8 ${DESTDIR}/usr/share/man/man8/ diff --git a/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 33c5c7ee148f..40b3e5482f8a 100644 --- a/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -289,7 +289,7 @@ void for_every_cpu(void (func)(int)) "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu); if (retval != 1) - return; + break; func(cpu); } diff --git a/trunk/tools/testing/selftests/breakpoints/Makefile b/trunk/tools/testing/selftests/breakpoints/Makefile index 931278035f5c..e18b42b254af 100644 --- a/trunk/tools/testing/selftests/breakpoints/Makefile +++ b/trunk/tools/testing/selftests/breakpoints/Makefile @@ -17,7 +17,7 @@ else endif run_tests: - ./breakpoint_test + @./breakpoint_test || echo "breakpoints selftests: [FAIL]" clean: rm -fr breakpoint_test diff --git a/trunk/tools/testing/selftests/cpu-hotplug/Makefile b/trunk/tools/testing/selftests/cpu-hotplug/Makefile index 7c9c20ff578a..12657a5e4bf9 100644 --- 
a/trunk/tools/testing/selftests/cpu-hotplug/Makefile +++ b/trunk/tools/testing/selftests/cpu-hotplug/Makefile @@ -1,6 +1,6 @@ all: run_tests: - ./on-off-test.sh + @./on-off-test.sh || echo "cpu-hotplug selftests: [FAIL]" clean: diff --git a/trunk/tools/testing/selftests/kcmp/Makefile b/trunk/tools/testing/selftests/kcmp/Makefile index dc79b86ea65c..56eb5523dbb8 100644 --- a/trunk/tools/testing/selftests/kcmp/Makefile +++ b/trunk/tools/testing/selftests/kcmp/Makefile @@ -16,13 +16,13 @@ CFLAGS += -I../../../../arch/x86/include/ all: ifeq ($(ARCH),X86) - gcc $(CFLAGS) kcmp_test.c -o run_test + gcc $(CFLAGS) kcmp_test.c -o kcmp_test else echo "Not an x86 target, can't build kcmp selftest" endif -run-tests: all - ./kcmp_test +run_tests: all + @./kcmp_test || echo "kcmp_test: [FAIL]" clean: rm -fr ./run_test diff --git a/trunk/tools/testing/selftests/kcmp/kcmp_test.c b/trunk/tools/testing/selftests/kcmp/kcmp_test.c index 358cc6bfa35d..fa4f1b37e045 100644 --- a/trunk/tools/testing/selftests/kcmp/kcmp_test.c +++ b/trunk/tools/testing/selftests/kcmp/kcmp_test.c @@ -72,7 +72,8 @@ int main(int argc, char **argv) /* This one should return same fd */ ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1); if (ret) { - printf("FAIL: 0 expected but %d returned\n", ret); + printf("FAIL: 0 expected but %d returned (%s)\n", + ret, strerror(errno)); ret = -1; } else printf("PASS: 0 returned as expected\n"); @@ -80,7 +81,8 @@ int main(int argc, char **argv) /* Compare with self */ ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0); if (ret) { - printf("FAIL: 0 expected but %li returned\n", ret); + printf("FAIL: 0 expected but %li returned (%s)\n", + ret, strerror(errno)); ret = -1; } else printf("PASS: 0 returned as expected\n"); diff --git a/trunk/tools/testing/selftests/memory-hotplug/Makefile b/trunk/tools/testing/selftests/memory-hotplug/Makefile index 7c9c20ff578a..0f49c3f5f58d 100644 --- a/trunk/tools/testing/selftests/memory-hotplug/Makefile +++ b/trunk/tools/testing/selftests/memory-hotplug/Makefile @@ -1,6 +1,6 @@ all: run_tests: - ./on-off-test.sh + @./on-off-test.sh || echo "memory-hotplug selftests: [FAIL]" clean: diff --git a/trunk/tools/testing/selftests/mqueue/Makefile b/trunk/tools/testing/selftests/mqueue/Makefile index 54c0aad2b47c..218a122c7951 100644 --- a/trunk/tools/testing/selftests/mqueue/Makefile +++ b/trunk/tools/testing/selftests/mqueue/Makefile @@ -3,8 +3,8 @@ all: gcc -O2 -lrt -lpthread -lpopt -o mq_perf_tests mq_perf_tests.c run_tests: - ./mq_open_tests /test1 - ./mq_perf_tests + @./mq_open_tests /test1 || echo "mq_open_tests: [FAIL]" + @./mq_perf_tests || echo "mq_perf_tests: [FAIL]" clean: rm -f mq_open_tests mq_perf_tests diff --git a/trunk/tools/testing/selftests/vm/Makefile b/trunk/tools/testing/selftests/vm/Makefile index 7300d0702efe..436d2e81868b 100644 --- a/trunk/tools/testing/selftests/vm/Makefile +++ b/trunk/tools/testing/selftests/vm/Makefile @@ -8,7 +8,7 @@ all: hugepage-mmap hugepage-shm map_hugetlb thuge-gen $(CC) $(CFLAGS) -o $@ $^ run_tests: all - /bin/sh ./run_vmtests + @/bin/sh ./run_vmtests || echo "vmtests: [FAIL]" clean: $(RM) hugepage-mmap hugepage-shm map_hugetlb
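Note on the RAPL power columns added to turbostat above: MSR_PKG_ENERGY_STATUS and the related energy MSRs are free-running 32-bit counters in units of rapl_energy_units joules (decoded from bits 12:8 of MSR_RAPL_POWER_UNIT), so the patch takes wrap-safe deltas with DELTA_WRAP32 and divides by the sample interval to report watts. The sketch below is not part of the patch and the helper names are purely illustrative; it assumes two raw counter reads and the decoded energy unit are already in hand, and only restates the arithmetic format_counters() performs.

#include <stdint.h>

/* Wrap-safe delta of a free-running 32-bit counter (what DELTA_WRAP32 does). */
static uint32_t delta_wrap32(uint32_t cur, uint32_t prev)
{
	return (cur >= prev) ? cur - prev
			     : (uint32_t)(0x100000000ULL + cur - prev);
}

/*
 * Average power over one sample, in watts.
 * rapl_energy_units = 1.0 / (1 << ((msr >> 8) & 0x1F)) from MSR_RAPL_POWER_UNIT,
 * e.g. 1/65536 J (about 15 microjoules) on the Sandy Bridge example in the man page.
 */
static double rapl_counter_to_watts(uint32_t cur, uint32_t prev,
				    double rapl_energy_units, double interval_sec)
{
	return delta_wrap32(cur, prev) * rapl_energy_units / interval_sec;
}

With that roughly 15 microjoule energy unit and a 77 W package TDP, the 32-bit counter covers about 0xFFFFFFFF * rapl_energy_units / tdp, or 851 seconds; that is the "Joule Counter Range" rapl_probe() reports in verbose mode, and why format_counters() prints the watt columns with a trailing "**" when the measurement interval exceeds it.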