From 00b87c5d365e3c310a5a0e69c8980e6da01e5236 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 20 Dec 2012 07:21:54 -0800 Subject: [PATCH] --- yaml --- r: 347407 b: refs/heads/master c: 1ffab3d4139533eff6e27b7568825307e575faa6 h: refs/heads/master i: 347405: 7f62fa201c2b27500c2f9b0a8a8a5f424050a2a6 347403: a60ba2cfa5a421ac8013f637b4d6001a3045033c 347399: d3c4f11b5854fa4a037732752b80ea195e49ac9c 347391: 0b19f785dc15d8767cefa3337799cb74f2eba6c8 v: v3 --- [refs] | 2 +- trunk/.gitignore | 1 - trunk/Documentation/00-INDEX | 2 - trunk/Documentation/ABI/README | 3 - .../ABI/stable/sysfs-devices-node | 96 +- trunk/Documentation/ABI/testing/ima_policy | 3 +- trunk/Documentation/DocBook/kernel-api.tmpl | 3 + trunk/Documentation/aoe/aoe.txt | 4 +- .../Documentation/backlight/lp855x-driver.txt | 10 +- trunk/Documentation/cgroups/memory.txt | 66 +- .../cgroups/resource_counter.txt | 7 +- .../devicetree/bindings/arm/davinci/nand.txt | 8 + .../devicetree/bindings/i2c/i2c-cbus-gpio.txt | 27 + .../devicetree/bindings/i2c/i2c-mux-gpio.txt | 81 + .../devicetree/bindings/i2c/i2c-ocores.txt | 2 +- .../devicetree/bindings/i2c/i2c-s3c2410.txt | 20 +- .../bindings/input/gpio-matrix-keypad.txt | 46 + .../devicetree/bindings/input/pwm-beeper.txt | 7 + .../bindings/input/stmpe-keypad.txt | 39 + .../bindings/input/tca8418_keypad.txt | 8 + .../bindings/input/touchscreen/mms114.txt | 34 + .../bindings/input/touchscreen/stmpe.txt | 43 + .../devicetree/bindings/mtd/denali-nand.txt | 23 + .../devicetree/bindings/mtd/flctl-nand.txt | 49 + .../devicetree/bindings/mtd/fsmc-nand.txt | 12 +- .../devicetree/bindings/mtd/m25p80.txt | 29 + .../devicetree/bindings/mtd/mtd-physmap.txt | 3 + .../bindings/powerpc/fsl/raideng.txt | 81 + .../devicetree/bindings/pwm/pwm-tiecap.txt | 23 + .../devicetree/bindings/pwm/pwm-tiehrpwm.txt | 23 + .../devicetree/bindings/pwm/pwm-tipwmss.txt | 31 + .../devicetree/bindings/pwm/pwm.txt | 17 +- .../devicetree/bindings/pwm/spear-pwm.txt | 18 + .../devicetree/bindings/pwm/ti,twl-pwm.txt | 17 + .../devicetree/bindings/pwm/ti,twl-pwmled.txt | 17 + .../devicetree/bindings/pwm/vt8500-pwm.txt | 17 + .../devicetree/bindings/rtc/imxdi-rtc.txt | 17 + .../devicetree/bindings/rtc/rtc-omap.txt | 17 + .../bindings/spi/nvidia,tegra20-sflash.txt | 2 +- .../bindings/spi/nvidia,tegra20-slink.txt | 2 +- .../devicetree/bindings/spi/spi_atmel.txt | 26 + trunk/Documentation/filesystems/proc.txt | 130 +- trunk/Documentation/filesystems/vfat.txt | 9 + trunk/Documentation/hwmon/it87 | 10 + trunk/Documentation/kernel-parameters.txt | 7 +- trunk/Documentation/powerpc/ptrace.txt | 16 + trunk/Documentation/security/00-INDEX | 2 + trunk/Documentation/sparse.txt | 18 + trunk/Documentation/x86/boot.txt | 3 +- trunk/Documentation/xtensa/atomctl.txt | 44 + trunk/MAINTAINERS | 18 +- trunk/Makefile | 6 + trunk/arch/Kconfig | 20 + trunk/arch/alpha/include/asm/Kbuild | 9 - trunk/arch/alpha/include/asm/a.out.h | 89 +- trunk/arch/alpha/include/asm/compiler.h | 115 +- trunk/arch/alpha/include/asm/console.h | 48 +- trunk/arch/alpha/include/asm/fpu.h | 124 +- trunk/arch/alpha/include/asm/pal.h | 50 +- trunk/arch/alpha/include/asm/param.h | 20 +- trunk/arch/alpha/include/asm/ptrace.h | 68 +- trunk/arch/alpha/include/asm/signal.h | 135 +- trunk/arch/alpha/include/asm/socket.h | 79 +- trunk/arch/alpha/include/asm/termios.h | 68 +- trunk/arch/alpha/include/asm/types.h | 13 +- trunk/arch/alpha/include/asm/unistd.h | 469 +- trunk/arch/alpha/include/uapi/asm/Kbuild | 40 + trunk/arch/alpha/include/uapi/asm/a.out.h | 91 + .../alpha/include/{ 
=> uapi}/asm/auxvec.h | 0 .../include/{ => uapi}/asm/bitsperlong.h | 0 .../alpha/include/{ => uapi}/asm/byteorder.h | 0 trunk/arch/alpha/include/uapi/asm/compiler.h | 117 + trunk/arch/alpha/include/uapi/asm/console.h | 50 + .../arch/alpha/include/{ => uapi}/asm/errno.h | 0 .../arch/alpha/include/{ => uapi}/asm/fcntl.h | 0 trunk/arch/alpha/include/uapi/asm/fpu.h | 123 + .../alpha/include/{ => uapi}/asm/gentrap.h | 0 .../arch/alpha/include/{ => uapi}/asm/ioctl.h | 0 .../alpha/include/{ => uapi}/asm/ioctls.h | 0 .../alpha/include/{ => uapi}/asm/ipcbuf.h | 0 .../alpha/include/{ => uapi}/asm/kvm_para.h | 0 .../arch/alpha/include/{ => uapi}/asm/mman.h | 0 .../alpha/include/{ => uapi}/asm/msgbuf.h | 0 trunk/arch/alpha/include/uapi/asm/pal.h | 52 + trunk/arch/alpha/include/uapi/asm/param.h | 21 + .../arch/alpha/include/{ => uapi}/asm/poll.h | 0 .../include/{ => uapi}/asm/posix_types.h | 0 trunk/arch/alpha/include/uapi/asm/ptrace.h | 70 + trunk/arch/alpha/include/{ => uapi}/asm/reg.h | 0 .../alpha/include/{ => uapi}/asm/regdef.h | 0 .../alpha/include/{ => uapi}/asm/resource.h | 0 .../alpha/include/{ => uapi}/asm/sembuf.h | 0 .../arch/alpha/include/{ => uapi}/asm/setup.h | 0 .../alpha/include/{ => uapi}/asm/shmbuf.h | 0 .../alpha/include/{ => uapi}/asm/sigcontext.h | 0 .../alpha/include/{ => uapi}/asm/siginfo.h | 0 trunk/arch/alpha/include/uapi/asm/signal.h | 135 + trunk/arch/alpha/include/uapi/asm/socket.h | 81 + .../alpha/include/{ => uapi}/asm/sockios.h | 0 .../arch/alpha/include/{ => uapi}/asm/stat.h | 0 .../alpha/include/{ => uapi}/asm/statfs.h | 0 .../arch/alpha/include/{ => uapi}/asm/swab.h | 0 .../alpha/include/{ => uapi}/asm/sysinfo.h | 0 .../alpha/include/{ => uapi}/asm/termbits.h | 0 trunk/arch/alpha/include/uapi/asm/termios.h | 70 + trunk/arch/alpha/include/uapi/asm/types.h | 16 + trunk/arch/alpha/include/uapi/asm/unistd.h | 471 ++ trunk/arch/arm/boot/dts/imx28-cfa10049.dts | 24 + trunk/arch/arm/boot/dts/spear13xx.dtsi | 14 +- trunk/arch/arm/boot/dts/spear300.dtsi | 8 +- trunk/arch/arm/boot/dts/spear310.dtsi | 8 +- trunk/arch/arm/boot/dts/spear320.dtsi | 8 +- trunk/arch/arm/boot/dts/spear600.dtsi | 8 +- trunk/arch/arm/configs/nhk8815_defconfig | 2 +- trunk/arch/arm/include/uapi/asm/unistd.h | 1 + trunk/arch/arm/kernel/calls.S | 1 + trunk/arch/arm/mach-davinci/devices-da8xx.c | 13 +- trunk/arch/arm/mach-mxs/mach-mxs.c | 2 +- trunk/arch/arm/mach-nomadik/board-nhk8815.c | 71 +- .../arch/arm/mach-nomadik/include/mach/fsmc.h | 29 - trunk/arch/arm/mach-omap1/board-nokia770.c | 14 +- trunk/arch/arm/mach-omap2/board-n8x0.c | 42 + .../arm/mach-omap2/board-rx51-peripherals.c | 6 + trunk/arch/arm/mach-omap2/i2c.c | 19 + .../arm/mach-omap2/omap_hwmod_2430_data.c | 3 +- .../arm/mach-omap2/omap_hwmod_33xx_data.c | 3 +- .../arm/mach-omap2/omap_hwmod_3xxx_data.c | 12 +- .../arm/mach-omap2/omap_hwmod_44xx_data.c | 3 +- trunk/arch/arm/mach-u300/core.c | 14 +- .../arch/arm/mach-ux500/board-mop500-stuib.c | 71 +- trunk/arch/arm64/include/asm/unistd.h | 1 + trunk/arch/arm64/kernel/sys_compat.c | 15 - trunk/arch/blackfin/include/asm/Kbuild | 6 +- trunk/arch/blackfin/include/asm/bfin_sport.h | 128 +- trunk/arch/blackfin/include/asm/bfin_twi.h | 2 +- trunk/arch/blackfin/include/asm/fixed_code.h | 30 +- trunk/arch/blackfin/include/asm/kvm_para.h | 1 - trunk/arch/blackfin/include/asm/pgtable.h | 2 - trunk/arch/blackfin/include/asm/ptrace.h | 161 +- trunk/arch/blackfin/include/asm/uaccess.h | 41 +- trunk/arch/blackfin/include/asm/unistd.h | 430 +- trunk/arch/blackfin/include/mach-common/irq.h | 5 +- 
trunk/arch/blackfin/include/uapi/asm/Kbuild | 16 + .../blackfin/include/uapi/asm/bfin_sport.h | 136 + .../include/{ => uapi}/asm/byteorder.h | 0 .../include/{ => uapi}/asm/cachectl.h | 0 .../blackfin/include/{ => uapi}/asm/fcntl.h | 0 .../blackfin/include/uapi/asm/fixed_code.h | 38 + .../blackfin/include/{ => uapi}/asm/ioctls.h | 0 .../blackfin/include/{ => uapi}/asm/poll.h | 0 .../include/{ => uapi}/asm/posix_types.h | 0 trunk/arch/blackfin/include/uapi/asm/ptrace.h | 170 + .../include/{ => uapi}/asm/sigcontext.h | 0 .../blackfin/include/{ => uapi}/asm/siginfo.h | 0 .../blackfin/include/{ => uapi}/asm/signal.h | 0 .../blackfin/include/{ => uapi}/asm/stat.h | 0 .../blackfin/include/{ => uapi}/asm/swab.h | 0 trunk/arch/blackfin/include/uapi/asm/unistd.h | 437 ++ trunk/arch/blackfin/kernel/kgdb.c | 13 +- .../mach-bf518/include/mach/anomaly.h | 1 + .../mach-bf527/include/mach/anomaly.h | 1 + .../mach-bf533/include/mach/anomaly.h | 1 + .../mach-bf537/include/mach/anomaly.h | 1 + .../mach-bf538/include/mach/anomaly.h | 1 + .../mach-bf548/include/mach/anomaly.h | 1 + .../mach-bf561/include/mach/anomaly.h | 1 + .../blackfin/mach-bf609/include/mach/irq.h | 3 - trunk/arch/blackfin/mach-bf609/pm.c | 3 +- trunk/arch/blackfin/mach-common/dpmc.c | 19 +- .../arch/blackfin/mach-common/ints-priority.c | 272 +- trunk/arch/cris/include/asm/io.h | 39 +- trunk/arch/cris/kernel/module.c | 2 - trunk/arch/frv/kernel/setup.c | 12 +- trunk/arch/frv/mm/init.c | 2 +- trunk/arch/h8300/Kconfig | 1 + trunk/arch/openrisc/include/asm/Kbuild | 1 + .../arch/openrisc/include/uapi/asm/kvm_para.h | 1 - trunk/arch/openrisc/kernel/asm-offsets.c | 6 +- trunk/arch/parisc/kernel/module.c | 2 - trunk/arch/powerpc/Makefile | 2 +- trunk/arch/powerpc/boot/dts/a3m071.dts | 144 + .../powerpc/boot/dts/fsl/p5020si-post.dtsi | 1 + .../powerpc/boot/dts/fsl/p5020si-pre.dtsi | 6 + .../powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi | 85 + trunk/arch/powerpc/configs/pseries_defconfig | 2 +- trunk/arch/powerpc/include/asm/bitops.h | 75 +- trunk/arch/powerpc/include/asm/cputable.h | 12 +- trunk/arch/powerpc/include/asm/dbell.h | 2 +- .../arch/powerpc/include/asm/exception-64s.h | 97 +- trunk/arch/powerpc/include/asm/firmware.h | 4 +- trunk/arch/powerpc/include/asm/fsl_gtm.h | 2 +- trunk/arch/powerpc/include/asm/fsl_guts.h | 4 +- trunk/arch/powerpc/include/asm/hvcall.h | 23 +- trunk/arch/powerpc/include/asm/immap_qe.h | 2 +- trunk/arch/powerpc/include/asm/machdep.h | 39 +- trunk/arch/powerpc/include/asm/mmu.h | 1 + .../powerpc/include/asm/pSeries_reconfig.h | 47 - trunk/arch/powerpc/include/asm/ppc-opcode.h | 6 +- trunk/arch/powerpc/include/asm/prom.h | 16 + trunk/arch/powerpc/include/asm/qe.h | 2 +- trunk/arch/powerpc/include/asm/qe_ic.h | 2 +- trunk/arch/powerpc/include/asm/reg.h | 3 + trunk/arch/powerpc/include/asm/rtas.h | 5 + trunk/arch/powerpc/include/asm/setup.h | 29 + trunk/arch/powerpc/include/asm/systbl.h | 3 +- trunk/arch/powerpc/include/asm/ucc.h | 2 +- trunk/arch/powerpc/include/asm/ucc_fast.h | 2 +- trunk/arch/powerpc/include/asm/ucc_slow.h | 2 +- trunk/arch/powerpc/include/asm/udbg.h | 1 - trunk/arch/powerpc/include/asm/unistd.h | 3 +- trunk/arch/powerpc/include/uapi/asm/setup.h | 31 - trunk/arch/powerpc/include/uapi/asm/unistd.h | 1 + trunk/arch/powerpc/kernel/Makefile | 2 +- .../{cpu_setup_power7.S => cpu_setup_power.S} | 32 +- trunk/arch/powerpc/kernel/cputable.c | 38 + trunk/arch/powerpc/kernel/entry_64.S | 2 + trunk/arch/powerpc/kernel/exceptions-64s.S | 308 +- trunk/arch/powerpc/kernel/head_64.S | 6 +- 
trunk/arch/powerpc/kernel/idle.c | 3 - trunk/arch/powerpc/kernel/iommu.c | 16 +- trunk/arch/powerpc/kernel/machine_kexec.c | 14 +- trunk/arch/powerpc/kernel/machine_kexec_64.c | 8 +- trunk/arch/powerpc/kernel/pci_32.c | 2 +- trunk/arch/powerpc/kernel/prom.c | 7 +- trunk/arch/powerpc/kernel/prom_init.c | 11 +- trunk/arch/powerpc/kernel/ptrace.c | 90 +- trunk/arch/powerpc/kernel/rtas.c | 1 - trunk/arch/powerpc/kernel/rtas_flash.c | 4 +- trunk/arch/powerpc/kernel/setup_64.c | 5 + trunk/arch/powerpc/kernel/sys_ppc32.c | 17 +- trunk/arch/powerpc/kernel/udbg.c | 23 - trunk/arch/powerpc/mm/numa.c | 12 - trunk/arch/powerpc/mm/tlb_nohash_low.S | 15 +- trunk/arch/powerpc/perf/power7-pmu.c | 17 +- trunk/arch/powerpc/platforms/512x/Kconfig | 1 - .../arch/powerpc/platforms/512x/mpc5121_ads.c | 3 + trunk/arch/powerpc/platforms/512x/mpc512x.h | 11 +- .../powerpc/platforms/512x/mpc512x_shared.c | 25 +- trunk/arch/powerpc/platforms/52xx/lite5200.c | 2 +- .../powerpc/platforms/52xx/mpc5200_simple.c | 1 + .../powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 16 +- .../powerpc/platforms/82xx/pq2ads-pci-pic.c | 8 +- .../arch/powerpc/platforms/83xx/mpc832x_mds.c | 2 +- .../arch/powerpc/platforms/83xx/mpc836x_mds.c | 2 +- .../arch/powerpc/platforms/83xx/mpc836x_rdk.c | 2 +- .../arch/powerpc/platforms/83xx/mpc837x_rdb.c | 2 +- .../arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- trunk/arch/powerpc/platforms/85xx/p1022_ds.c | 8 +- trunk/arch/powerpc/platforms/85xx/smp.c | 49 +- .../powerpc/platforms/86xx/mpc8610_hpcd.c | 2 + .../arch/powerpc/platforms/cell/spufs/sched.c | 2 +- .../powerpc/platforms/powermac/cpufreq_32.c | 5 +- .../arch/powerpc/platforms/powernv/pci-ioda.c | 25 +- trunk/arch/powerpc/platforms/ps3/os-area.c | 6 +- trunk/arch/powerpc/platforms/pseries/dlpar.c | 34 +- .../arch/powerpc/platforms/pseries/firmware.c | 1 + .../powerpc/platforms/pseries/hotplug-cpu.c | 8 +- .../platforms/pseries/hotplug-memory.c | 60 +- trunk/arch/powerpc/platforms/pseries/iommu.c | 10 +- .../arch/powerpc/platforms/pseries/mobility.c | 4 +- .../platforms/pseries/plpar_wrappers.h | 31 + .../arch/powerpc/platforms/pseries/reconfig.c | 119 +- trunk/arch/powerpc/platforms/pseries/setup.c | 77 +- trunk/arch/powerpc/platforms/pseries/smp.c | 1 - trunk/arch/powerpc/sysdev/fsl_gtm.c | 2 +- trunk/arch/powerpc/sysdev/fsl_pci.c | 37 +- trunk/arch/powerpc/sysdev/pmi.c | 13 +- trunk/arch/powerpc/sysdev/qe_lib/qe.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/qe_ic.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/qe_ic.h | 2 +- trunk/arch/powerpc/sysdev/qe_lib/qe_io.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/ucc.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/ucc_fast.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/ucc_slow.c | 2 +- trunk/arch/powerpc/sysdev/qe_lib/usb.c | 2 +- trunk/arch/powerpc/xmon/Makefile | 2 +- trunk/arch/powerpc/xmon/nonstdio.c | 53 +- trunk/arch/powerpc/xmon/nonstdio.h | 6 - trunk/arch/powerpc/xmon/start.c | 34 - trunk/arch/powerpc/xmon/xmon.c | 26 +- trunk/arch/s390/include/asm/ccwdev.h | 4 +- trunk/arch/s390/include/asm/pci.h | 39 + trunk/arch/s390/include/asm/pci_debug.h | 36 + trunk/arch/s390/pci/Makefile | 2 +- trunk/arch/s390/pci/pci.c | 73 +- trunk/arch/s390/pci/pci_clp.c | 1 + trunk/arch/s390/pci/pci_debug.c | 193 + trunk/arch/s390/pci/pci_dma.c | 8 +- trunk/arch/s390/pci/pci_event.c | 2 + trunk/arch/sparc/crypto/aes_asm.S | 20 +- trunk/arch/sparc/crypto/aes_glue.c | 31 +- trunk/arch/sparc/crypto/camellia_glue.c | 3 + trunk/arch/sparc/crypto/des_asm.S | 1 + trunk/arch/sparc/crypto/des_glue.c | 6 + trunk/arch/sparc/include/asm/hugetlb.h 
| 10 +- trunk/arch/sparc/include/asm/pgtable_64.h | 8 +- trunk/arch/sparc/include/asm/unistd.h | 1 + trunk/arch/sparc/kernel/module.c | 4 - trunk/arch/sparc/kernel/sys_sparc32.c | 14 - trunk/arch/tile/include/asm/compat.h | 2 - trunk/arch/tile/include/asm/elf.h | 2 + trunk/arch/tile/include/asm/ptrace.h | 3 +- trunk/arch/tile/include/asm/unistd.h | 1 + trunk/arch/tile/include/uapi/asm/ptrace.h | 8 +- trunk/arch/tile/kernel/compat.c | 18 - trunk/arch/tile/kernel/module.c | 2 - trunk/arch/tile/kernel/pci.c | 4 +- trunk/arch/tile/kernel/pci_gx.c | 3 +- trunk/arch/tile/kernel/ptrace.c | 140 +- trunk/arch/um/drivers/mconsole_kern.c | 2 +- trunk/arch/unicore32/kernel/module.c | 3 - trunk/arch/x86/Makefile | 5 +- trunk/arch/x86/include/asm/paravirt.h | 2 - .../arch/x86/include/uapi/asm/hw_breakpoint.h | 1 + trunk/arch/x86/include/uapi/asm/msr-index.h | 37 + trunk/arch/x86/include/uapi/asm/setup.h | 1 + trunk/arch/x86/kernel/cpu/proc.c | 7 +- trunk/arch/x86/kernel/irqinit.c | 40 - trunk/arch/x86/kernel/traps.c | 6 - trunk/arch/x86/platform/iris/iris.c | 67 +- trunk/arch/x86/syscalls/syscall_32.tbl | 1 + trunk/arch/x86/syscalls/syscall_64.tbl | 1 + trunk/arch/x86/xen/enlighten.c | 7 +- trunk/arch/x86/xen/smp.c | 2 +- trunk/arch/xtensa/Kconfig | 21 + trunk/arch/xtensa/Kconfig.debug | 22 +- trunk/arch/xtensa/Makefile | 20 +- trunk/arch/xtensa/boot/Makefile | 25 +- trunk/arch/xtensa/boot/boot-elf/Makefile | 26 +- trunk/arch/xtensa/boot/boot-redboot/Makefile | 26 +- trunk/arch/xtensa/boot/boot-uboot/Makefile | 14 + trunk/arch/xtensa/boot/dts/lx60.dts | 11 + trunk/arch/xtensa/boot/dts/ml605.dts | 11 + .../xtensa/boot/dts/xtfpga-flash-16m.dtsi | 26 + .../arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi | 18 + trunk/arch/xtensa/boot/dts/xtfpga.dtsi | 56 + trunk/arch/xtensa/include/asm/atomic.h | 271 +- trunk/arch/xtensa/include/asm/barrier.h | 6 +- trunk/arch/xtensa/include/asm/bitops.h | 127 +- trunk/arch/xtensa/include/asm/bootparam.h | 20 +- trunk/arch/xtensa/include/asm/cacheasm.h | 1 - trunk/arch/xtensa/include/asm/cacheflush.h | 3 +- trunk/arch/xtensa/include/asm/checksum.h | 19 +- trunk/arch/xtensa/include/asm/cmpxchg.h | 74 +- trunk/arch/xtensa/include/asm/current.h | 2 +- trunk/arch/xtensa/include/asm/delay.h | 7 +- trunk/arch/xtensa/include/asm/dma-mapping.h | 6 +- trunk/arch/xtensa/include/asm/elf.h | 10 +- trunk/arch/xtensa/include/asm/highmem.h | 1 - .../arch/xtensa/include/asm/initialize_mmu.h | 55 + trunk/arch/xtensa/include/asm/mmu_context.h | 2 +- trunk/arch/xtensa/include/asm/nommu_context.h | 2 +- trunk/arch/xtensa/include/asm/page.h | 20 +- trunk/arch/xtensa/include/asm/pci-bridge.h | 2 +- trunk/arch/xtensa/include/asm/pci.h | 2 +- trunk/arch/xtensa/include/asm/pgalloc.h | 2 +- trunk/arch/xtensa/include/asm/pgtable.h | 8 +- trunk/arch/xtensa/include/asm/platform.h | 1 - trunk/arch/xtensa/include/asm/processor.h | 10 +- trunk/arch/xtensa/include/asm/prom.h | 6 + trunk/arch/xtensa/include/asm/ptrace.h | 4 +- trunk/arch/xtensa/include/asm/regs.h | 5 +- trunk/arch/xtensa/include/asm/spinlock.h | 188 +- trunk/arch/xtensa/include/asm/syscall.h | 11 +- trunk/arch/xtensa/include/asm/traps.h | 23 + trunk/arch/xtensa/include/asm/uaccess.h | 43 +- trunk/arch/xtensa/kernel/Makefile | 8 +- trunk/arch/xtensa/kernel/align.S | 4 +- trunk/arch/xtensa/kernel/asm-offsets.c | 5 +- trunk/arch/xtensa/kernel/coprocessor.S | 25 +- trunk/arch/xtensa/kernel/entry.S | 67 +- trunk/arch/xtensa/kernel/head.S | 21 +- trunk/arch/xtensa/kernel/irq.c | 132 +- trunk/arch/xtensa/kernel/module.c | 2 +- 
trunk/arch/xtensa/kernel/platform.c | 1 - trunk/arch/xtensa/kernel/process.c | 2 +- trunk/arch/xtensa/kernel/ptrace.c | 3 +- trunk/arch/xtensa/kernel/setup.c | 279 +- trunk/arch/xtensa/kernel/signal.c | 8 +- trunk/arch/xtensa/kernel/syscall.c | 1 - trunk/arch/xtensa/kernel/time.c | 7 +- trunk/arch/xtensa/kernel/traps.c | 18 +- trunk/arch/xtensa/kernel/vectors.S | 67 +- trunk/arch/xtensa/lib/checksum.S | 15 +- trunk/arch/xtensa/lib/memcopy.S | 6 +- trunk/arch/xtensa/lib/pci-auto.c | 9 +- trunk/arch/xtensa/lib/strncpy_user.S | 4 +- trunk/arch/xtensa/lib/strnlen_user.S | 1 - trunk/arch/xtensa/lib/usercopy.S | 1 - trunk/arch/xtensa/mm/cache.c | 27 +- trunk/arch/xtensa/mm/fault.c | 1 - trunk/arch/xtensa/mm/init.c | 16 +- trunk/arch/xtensa/mm/misc.S | 51 +- trunk/arch/xtensa/mm/mmu.c | 2 +- trunk/arch/xtensa/mm/tlb.c | 9 +- .../platforms/iss/include/platform/serial.h | 15 + .../platforms/iss/include/platform/simcall.h | 7 +- trunk/arch/xtensa/platforms/xtfpga/Makefile | 9 + .../xtfpga/include/platform/hardware.h | 69 + .../platforms/xtfpga/include/platform/lcd.h | 20 + .../xtfpga/include/platform/serial.h | 18 + trunk/arch/xtensa/platforms/xtfpga/lcd.c | 76 + trunk/arch/xtensa/platforms/xtfpga/setup.c | 301 ++ trunk/arch/xtensa/variants/s6000/gpio.c | 4 +- trunk/block/Kconfig | 1 + trunk/block/genhd.c | 8 +- trunk/block/partitions/efi.c | 7 +- trunk/block/partitions/msdos.c | 21 +- trunk/drivers/atm/solos-pci.c | 186 +- trunk/drivers/base/dma-buf.c | 2 + trunk/drivers/bcma/driver_chipcommon_pmu.c | 3 +- trunk/drivers/block/aoe/aoe.h | 57 +- trunk/drivers/block/aoe/aoeblk.c | 104 +- trunk/drivers/block/aoe/aoechr.c | 7 +- trunk/drivers/block/aoe/aoecmd.c | 715 ++- trunk/drivers/block/aoe/aoedev.c | 243 +- trunk/drivers/block/aoe/aoemain.c | 2 +- trunk/drivers/block/aoe/aoenet.c | 15 +- trunk/drivers/block/cciss.c | 21 +- trunk/drivers/block/drbd/Kconfig | 10 +- trunk/drivers/block/drbd/Makefile | 2 + trunk/drivers/block/drbd/drbd_actlog.c | 702 ++- trunk/drivers/block/drbd/drbd_bitmap.c | 249 +- trunk/drivers/block/drbd/drbd_int.h | 1365 +++--- trunk/drivers/block/drbd/drbd_interval.c | 207 + trunk/drivers/block/drbd/drbd_interval.h | 40 + trunk/drivers/block/drbd/drbd_main.c | 3861 +++++++--------- trunk/drivers/block/drbd/drbd_nl.c | 3332 ++++++++------ trunk/drivers/block/drbd/drbd_nla.c | 55 + trunk/drivers/block/drbd/drbd_nla.h | 8 + trunk/drivers/block/drbd/drbd_proc.c | 41 +- trunk/drivers/block/drbd/drbd_receiver.c | 3904 ++++++++++------- trunk/drivers/block/drbd/drbd_req.c | 1574 ++++--- trunk/drivers/block/drbd/drbd_req.h | 187 +- trunk/drivers/block/drbd/drbd_state.c | 1856 ++++++++ trunk/drivers/block/drbd/drbd_state.h | 161 + trunk/drivers/block/drbd/drbd_strings.c | 1 + trunk/drivers/block/drbd/drbd_worker.c | 1237 +++--- trunk/drivers/block/drbd/drbd_wrappers.h | 11 +- trunk/drivers/block/loop.c | 10 + trunk/drivers/block/xen-blkback/blkback.c | 301 +- trunk/drivers/block/xen-blkback/common.h | 16 + trunk/drivers/block/xen-blkback/xenbus.c | 23 +- trunk/drivers/block/xen-blkfront.c | 199 +- trunk/drivers/char/random.c | 40 +- trunk/drivers/clk/clk-nomadik.c | 1 + trunk/drivers/crypto/nx/nx-842.c | 20 +- trunk/drivers/crypto/nx/nx.c | 1 - trunk/drivers/dma/dmatest.c | 49 +- trunk/drivers/firmware/efivars.c | 1 - trunk/drivers/gpio/Kconfig | 1 + trunk/drivers/gpio/gpio-ich.c | 1 + trunk/drivers/gpio/gpio-mvebu.c | 17 - trunk/drivers/gpu/drm/ttm/ttm_bo.c | 2 +- trunk/drivers/hwmon/hwmon-vid.c | 10 + trunk/drivers/hwmon/hwmon.c | 26 +- trunk/drivers/hwmon/it87.c | 918 ++-- 
trunk/drivers/hwmon/twl4030-madc-hwmon.c | 2 +- trunk/drivers/hwmon/w83627ehf.c | 99 +- trunk/drivers/hwmon/w83627hf.c | 81 +- trunk/drivers/i2c/busses/Kconfig | 10 + trunk/drivers/i2c/busses/Makefile | 1 + trunk/drivers/i2c/busses/i2c-at91.c | 338 +- trunk/drivers/i2c/busses/i2c-cbus-gpio.c | 300 ++ trunk/drivers/i2c/busses/i2c-gpio.c | 6 +- trunk/drivers/i2c/busses/i2c-mxs.c | 2 +- trunk/drivers/i2c/busses/i2c-nomadik.c | 14 - trunk/drivers/i2c/busses/i2c-ocores.c | 164 +- trunk/drivers/i2c/busses/i2c-omap.c | 226 +- trunk/drivers/i2c/busses/i2c-rcar.c | 6 +- trunk/drivers/i2c/busses/i2c-s3c2410.c | 211 +- trunk/drivers/i2c/busses/i2c-sh_mobile.c | 150 +- trunk/drivers/i2c/muxes/i2c-mux-gpio.c | 145 +- trunk/drivers/infiniband/hw/ehca/hcp_if.c | 20 - trunk/drivers/input/gameport/emu10k1-gp.c | 6 +- trunk/drivers/input/gameport/fm801-gp.c | 6 +- trunk/drivers/input/input-mt.c | 2 +- trunk/drivers/input/input.c | 181 +- trunk/drivers/input/joystick/as5011.c | 29 +- trunk/drivers/input/joystick/maplecontrol.c | 6 +- trunk/drivers/input/joystick/walkera0701.c | 7 +- trunk/drivers/input/joystick/xpad.c | 33 +- trunk/drivers/input/keyboard/Kconfig | 2 +- trunk/drivers/input/keyboard/adp5520-keys.c | 6 +- trunk/drivers/input/keyboard/adp5588-keys.c | 18 +- trunk/drivers/input/keyboard/adp5589-keys.c | 21 +- trunk/drivers/input/keyboard/bf54x-keys.c | 6 +- .../drivers/input/keyboard/davinci_keyscan.c | 4 +- trunk/drivers/input/keyboard/ep93xx_keypad.c | 6 +- trunk/drivers/input/keyboard/gpio_keys.c | 90 +- .../drivers/input/keyboard/gpio_keys_polled.c | 26 +- trunk/drivers/input/keyboard/hilkbd.c | 10 +- trunk/drivers/input/keyboard/imx_keypad.c | 9 +- trunk/drivers/input/keyboard/jornada680_kbd.c | 6 +- trunk/drivers/input/keyboard/jornada720_kbd.c | 6 +- trunk/drivers/input/keyboard/lm8323.c | 6 +- trunk/drivers/input/keyboard/lm8333.c | 6 +- trunk/drivers/input/keyboard/locomokbd.c | 8 +- trunk/drivers/input/keyboard/lpc32xx-keys.c | 8 +- trunk/drivers/input/keyboard/matrix_keypad.c | 129 +- trunk/drivers/input/keyboard/max7359_keypad.c | 6 +- trunk/drivers/input/keyboard/mcs_touchkey.c | 6 +- .../drivers/input/keyboard/mpr121_touchkey.c | 12 +- .../input/keyboard/nomadik-ske-keypad.c | 38 +- trunk/drivers/input/keyboard/omap-keypad.c | 6 +- trunk/drivers/input/keyboard/omap4-keypad.c | 10 +- trunk/drivers/input/keyboard/opencores-kbd.c | 6 +- .../drivers/input/keyboard/pmic8xxx-keypad.c | 10 +- trunk/drivers/input/keyboard/pxa27x_keypad.c | 6 +- trunk/drivers/input/keyboard/pxa930_rotary.c | 6 +- trunk/drivers/input/keyboard/qt1070.c | 8 +- trunk/drivers/input/keyboard/qt2160.c | 31 +- trunk/drivers/input/keyboard/samsung-keypad.c | 109 +- trunk/drivers/input/keyboard/sh_keysc.c | 6 +- trunk/drivers/input/keyboard/spear-keyboard.c | 98 +- trunk/drivers/input/keyboard/stmpe-keypad.c | 142 +- trunk/drivers/input/keyboard/tc3589x-keypad.c | 6 +- trunk/drivers/input/keyboard/tca6416-keypad.c | 8 +- trunk/drivers/input/keyboard/tca8418_keypad.c | 179 +- trunk/drivers/input/keyboard/tegra-kbc.c | 16 +- .../drivers/input/keyboard/tnetv107x-keypad.c | 6 +- trunk/drivers/input/keyboard/twl4030_keypad.c | 8 +- trunk/drivers/input/keyboard/w90p910_keypad.c | 6 +- trunk/drivers/input/matrix-keymap.c | 23 +- trunk/drivers/input/misc/88pm80x_onkey.c | 6 +- trunk/drivers/input/misc/88pm860x_onkey.c | 6 +- trunk/drivers/input/misc/Kconfig | 29 +- trunk/drivers/input/misc/Makefile | 2 + trunk/drivers/input/misc/ab8500-ponkey.c | 6 +- trunk/drivers/input/misc/ad714x-i2c.c | 6 +- 
trunk/drivers/input/misc/ad714x-spi.c | 6 +- trunk/drivers/input/misc/adxl34x-i2c.c | 6 +- trunk/drivers/input/misc/adxl34x-spi.c | 6 +- trunk/drivers/input/misc/bfin_rotary.c | 6 +- trunk/drivers/input/misc/bma150.c | 28 +- trunk/drivers/input/misc/cma3000_d0x_i2c.c | 6 +- trunk/drivers/input/misc/cobalt_btns.c | 6 +- trunk/drivers/input/misc/da9052_onkey.c | 6 +- trunk/drivers/input/misc/da9055_onkey.c | 171 + trunk/drivers/input/misc/dm355evm_keys.c | 6 +- trunk/drivers/input/misc/gp2ap002a00f.c | 8 +- trunk/drivers/input/misc/gpio_tilt_polled.c | 6 +- trunk/drivers/input/misc/ixp4xx-beeper.c | 6 +- trunk/drivers/input/misc/kxtj9.c | 16 +- trunk/drivers/input/misc/m68kspkr.c | 6 +- trunk/drivers/input/misc/max8925_onkey.c | 6 +- trunk/drivers/input/misc/max8997_haptic.c | 6 +- trunk/drivers/input/misc/mc13783-pwrbutton.c | 6 +- trunk/drivers/input/misc/mma8450.c | 6 +- trunk/drivers/input/misc/mpu3050.c | 8 +- trunk/drivers/input/misc/pcap_keys.c | 6 +- trunk/drivers/input/misc/pcf50633-input.c | 6 +- trunk/drivers/input/misc/pcf8574_keypad.c | 6 +- trunk/drivers/input/misc/pcspkr.c | 6 +- trunk/drivers/input/misc/pm8xxx-vibrator.c | 6 +- trunk/drivers/input/misc/pmic8xxx-pwrkey.c | 6 +- trunk/drivers/input/misc/pwm-beeper.c | 20 +- trunk/drivers/input/misc/rb532_button.c | 6 +- trunk/drivers/input/misc/retu-pwrbutton.c | 99 + trunk/drivers/input/misc/rotary_encoder.c | 9 +- trunk/drivers/input/misc/sgi_btns.c | 6 +- trunk/drivers/input/misc/sparcspkr.c | 14 +- trunk/drivers/input/misc/twl4030-pwrbutton.c | 3 +- trunk/drivers/input/misc/twl4030-vibra.c | 6 +- trunk/drivers/input/misc/twl6040-vibra.c | 6 +- trunk/drivers/input/misc/wistron_btns.c | 20 +- trunk/drivers/input/misc/wm831x-on.c | 11 +- trunk/drivers/input/misc/xen-kbdfront.c | 2 +- trunk/drivers/input/mouse/alps.c | 10 +- trunk/drivers/input/mouse/gpio_mouse.c | 6 +- trunk/drivers/input/mouse/maplemouse.c | 6 +- trunk/drivers/input/mouse/navpoint.c | 6 +- trunk/drivers/input/mouse/pxa930_trkball.c | 6 +- trunk/drivers/input/mouse/synaptics_i2c.c | 6 +- trunk/drivers/input/serio/Kconfig | 9 + trunk/drivers/input/serio/Makefile | 1 + trunk/drivers/input/serio/altera_ps2.c | 6 +- trunk/drivers/input/serio/ambakmi.c | 6 +- trunk/drivers/input/serio/arc_ps2.c | 274 ++ trunk/drivers/input/serio/ct82c710.c | 6 +- trunk/drivers/input/serio/gscps2.c | 6 +- trunk/drivers/input/serio/hil_mlc.c | 13 +- trunk/drivers/input/serio/i8042-io.h | 2 +- trunk/drivers/input/serio/i8042-sparcio.h | 6 +- trunk/drivers/input/serio/i8042-x86ia64io.h | 9 + trunk/drivers/input/serio/i8042.c | 6 +- trunk/drivers/input/serio/maceps2.c | 8 +- trunk/drivers/input/serio/pcips2.c | 6 +- trunk/drivers/input/serio/q40kbd.c | 6 +- trunk/drivers/input/serio/rpckbd.c | 6 +- trunk/drivers/input/serio/sa1111ps2.c | 12 +- trunk/drivers/input/serio/serio.c | 11 - trunk/drivers/input/serio/xilinx_ps2.c | 8 +- trunk/drivers/input/tablet/wacom_sys.c | 58 +- trunk/drivers/input/tablet/wacom_wac.c | 32 +- trunk/drivers/input/tablet/wacom_wac.h | 2 + trunk/drivers/input/touchscreen/88pm860x-ts.c | 8 +- trunk/drivers/input/touchscreen/Kconfig | 12 - trunk/drivers/input/touchscreen/Makefile | 1 - trunk/drivers/input/touchscreen/ad7877.c | 6 +- trunk/drivers/input/touchscreen/ad7879-i2c.c | 6 +- trunk/drivers/input/touchscreen/ad7879-spi.c | 6 +- trunk/drivers/input/touchscreen/ads7846.c | 10 +- .../drivers/input/touchscreen/atmel_mxt_ts.c | 6 +- .../drivers/input/touchscreen/atmel_tsadcc.c | 6 +- .../drivers/input/touchscreen/auo-pixcir-ts.c | 8 +- 
trunk/drivers/input/touchscreen/bu21013_ts.c | 125 +- .../drivers/input/touchscreen/cy8ctmg110_ts.c | 19 +- trunk/drivers/input/touchscreen/cyttsp_i2c.c | 6 +- trunk/drivers/input/touchscreen/cyttsp_spi.c | 6 +- trunk/drivers/input/touchscreen/da9034-ts.c | 6 +- trunk/drivers/input/touchscreen/da9052_tsi.c | 10 +- trunk/drivers/input/touchscreen/edt-ft5x06.c | 28 +- trunk/drivers/input/touchscreen/eeti_ts.c | 6 +- trunk/drivers/input/touchscreen/egalax_ts.c | 8 +- .../input/touchscreen/h3600_ts_input.c | 479 -- trunk/drivers/input/touchscreen/htcpen.c | 6 +- trunk/drivers/input/touchscreen/ili210x.c | 6 +- .../input/touchscreen/intel-mid-touch.c | 14 +- .../drivers/input/touchscreen/jornada720_ts.c | 6 +- trunk/drivers/input/touchscreen/lpc32xx_ts.c | 6 +- trunk/drivers/input/touchscreen/max11801_ts.c | 8 +- trunk/drivers/input/touchscreen/mc13783_ts.c | 4 +- trunk/drivers/input/touchscreen/mcs5000_ts.c | 6 +- trunk/drivers/input/touchscreen/mms114.c | 68 +- trunk/drivers/input/touchscreen/pcap_ts.c | 6 +- .../drivers/input/touchscreen/pixcir_i2c_ts.c | 6 +- trunk/drivers/input/touchscreen/s3c2410_ts.c | 6 +- trunk/drivers/input/touchscreen/st1232.c | 8 +- trunk/drivers/input/touchscreen/stmpe-ts.c | 133 +- .../drivers/input/touchscreen/ti_am335x_tsc.c | 6 +- .../drivers/input/touchscreen/tnetv107x-ts.c | 6 +- trunk/drivers/input/touchscreen/tps6507x-ts.c | 4 +- trunk/drivers/input/touchscreen/tsc2005.c | 8 +- trunk/drivers/input/touchscreen/tsc2007.c | 6 +- trunk/drivers/input/touchscreen/ucb1400_ts.c | 8 +- trunk/drivers/input/touchscreen/w90p910_ts.c | 6 +- trunk/drivers/input/touchscreen/wacom_i2c.c | 6 +- trunk/drivers/input/touchscreen/wm831x-ts.c | 12 +- trunk/drivers/isdn/mISDN/dsp_core.c | 3 +- trunk/drivers/macintosh/smu.c | 2 +- .../drivers/macintosh/windfarm_fcu_controls.c | 14 +- .../drivers/macintosh/windfarm_lm75_sensor.c | 14 +- .../macintosh/windfarm_max6690_sensor.c | 13 +- trunk/drivers/macintosh/windfarm_smu_sat.c | 13 +- trunk/drivers/md/md.c | 258 +- trunk/drivers/md/md.h | 28 +- trunk/drivers/md/raid1.c | 15 +- trunk/drivers/md/raid10.c | 15 +- trunk/drivers/md/raid5.c | 55 +- trunk/drivers/message/fusion/mptscsih.c | 1 + trunk/drivers/mfd/stmpe.c | 2 + trunk/drivers/mtd/ar7part.c | 7 +- trunk/drivers/mtd/bcm63xxpart.c | 32 +- trunk/drivers/mtd/chips/cfi_cmdset_0002.c | 16 +- trunk/drivers/mtd/cmdlinepart.c | 91 +- trunk/drivers/mtd/devices/bcm47xxsflash.c | 4 +- trunk/drivers/mtd/devices/block2mtd.c | 4 +- trunk/drivers/mtd/devices/docg3.c | 2 +- trunk/drivers/mtd/devices/docprobe.c | 2 - trunk/drivers/mtd/devices/m25p80.c | 48 +- trunk/drivers/mtd/devices/mtd_dataflash.c | 14 +- trunk/drivers/mtd/devices/spear_smi.c | 23 +- trunk/drivers/mtd/devices/sst25l.c | 10 +- trunk/drivers/mtd/maps/Kconfig | 7 - trunk/drivers/mtd/maps/Makefile | 1 - trunk/drivers/mtd/maps/amd76xrom.c | 7 +- trunk/drivers/mtd/maps/autcpu12-nvram.c | 6 +- trunk/drivers/mtd/maps/bfin-async-flash.c | 9 +- trunk/drivers/mtd/maps/ck804xrom.c | 6 +- trunk/drivers/mtd/maps/esb2rom.c | 6 +- trunk/drivers/mtd/maps/fortunet.c | 277 -- trunk/drivers/mtd/maps/gpio-addr-flash.c | 12 +- trunk/drivers/mtd/maps/ichxrom.c | 8 +- trunk/drivers/mtd/maps/intel_vr_nor.c | 18 +- trunk/drivers/mtd/maps/lantiq-flash.c | 8 +- trunk/drivers/mtd/maps/latch-addr-flash.c | 4 +- trunk/drivers/mtd/maps/pci.c | 6 +- trunk/drivers/mtd/maps/physmap_of.c | 19 +- trunk/drivers/mtd/maps/pismo.c | 18 +- trunk/drivers/mtd/maps/pxa2xx-flash.c | 6 +- trunk/drivers/mtd/maps/sa1100-flash.c | 6 +- trunk/drivers/mtd/maps/scb2_flash.c | 
8 +- trunk/drivers/mtd/maps/sun_uflash.c | 6 +- trunk/drivers/mtd/maps/vmu-flash.c | 10 +- trunk/drivers/mtd/mtd_blkdevs.c | 51 +- trunk/drivers/mtd/mtdoops.c | 15 +- trunk/drivers/mtd/nand/Kconfig | 34 +- trunk/drivers/mtd/nand/Makefile | 4 +- trunk/drivers/mtd/nand/ams-delta.c | 6 +- trunk/drivers/mtd/nand/atmel_nand.c | 28 +- trunk/drivers/mtd/nand/au1550nd.c | 8 +- trunk/drivers/mtd/nand/bcm47xxnflash/Makefile | 4 + .../mtd/nand/bcm47xxnflash/bcm47xxnflash.h | 22 + trunk/drivers/mtd/nand/bcm47xxnflash/main.c | 108 + .../mtd/nand/bcm47xxnflash/ops_bcm4706.c | 413 ++ trunk/drivers/mtd/nand/bf5xx_nand.c | 8 +- trunk/drivers/mtd/nand/cafe_nand.c | 12 +- trunk/drivers/mtd/nand/cs553x_nand.c | 3 +- trunk/drivers/mtd/nand/davinci_nand.c | 13 +- trunk/drivers/mtd/nand/denali.c | 162 +- trunk/drivers/mtd/nand/denali.h | 5 + trunk/drivers/mtd/nand/denali_dt.c | 167 + trunk/drivers/mtd/nand/denali_pci.c | 144 + trunk/drivers/mtd/nand/diskonchip.c | 2 - trunk/drivers/mtd/nand/docg4.c | 73 +- trunk/drivers/mtd/nand/fsl_elbc_nand.c | 17 +- trunk/drivers/mtd/nand/fsl_ifc_nand.c | 6 +- trunk/drivers/mtd/nand/fsl_upm.c | 8 +- trunk/drivers/mtd/nand/fsmc_nand.c | 106 +- trunk/drivers/mtd/nand/gpio.c | 34 +- trunk/drivers/mtd/nand/gpmi-nand/gpmi-lib.c | 10 +- trunk/drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 41 +- trunk/drivers/mtd/nand/gpmi-nand/gpmi-nand.h | 1 - trunk/drivers/mtd/nand/jz4740_nand.c | 14 +- trunk/drivers/mtd/nand/lpc32xx_mlc.c | 6 +- trunk/drivers/mtd/nand/lpc32xx_slc.c | 6 +- trunk/drivers/mtd/nand/mpc5121_nfc.c | 8 +- trunk/drivers/mtd/nand/mxc_nand.c | 12 +- trunk/drivers/mtd/nand/nand_base.c | 114 +- trunk/drivers/mtd/nand/nandsim.c | 191 +- trunk/drivers/mtd/nand/ndfc.c | 6 +- trunk/drivers/mtd/nand/nomadik_nand.c | 235 - trunk/drivers/mtd/nand/nuc900_nand.c | 6 +- trunk/drivers/mtd/nand/omap2.c | 2 +- trunk/drivers/mtd/nand/orion_nand.c | 4 +- trunk/drivers/mtd/nand/pasemi_nand.c | 4 +- trunk/drivers/mtd/nand/plat_nand.c | 6 +- trunk/drivers/mtd/nand/s3c2410.c | 7 +- trunk/drivers/mtd/nand/sh_flctl.c | 306 +- trunk/drivers/mtd/nand/sharpsl.c | 6 +- trunk/drivers/mtd/nand/socrates_nand.c | 6 +- trunk/drivers/mtd/ofpart.c | 5 +- trunk/drivers/mtd/onenand/generic.c | 6 +- trunk/drivers/mtd/onenand/omap2.c | 6 +- trunk/drivers/mtd/onenand/samsung.c | 4 +- trunk/drivers/mtd/tests/mtd_nandbiterrs.c | 73 +- trunk/drivers/mtd/tests/mtd_nandecctest.c | 6 +- trunk/drivers/mtd/tests/mtd_oobtest.c | 171 +- trunk/drivers/mtd/tests/mtd_pagetest.c | 152 +- trunk/drivers/mtd/tests/mtd_readtest.c | 44 +- trunk/drivers/mtd/tests/mtd_speedtest.c | 88 +- trunk/drivers/mtd/tests/mtd_stresstest.c | 44 +- trunk/drivers/mtd/tests/mtd_subpagetest.c | 124 +- trunk/drivers/mtd/tests/mtd_torturetest.c | 73 +- trunk/drivers/net/bonding/bond_main.c | 2 - .../net/can/sja1000/sja1000_of_platform.c | 2 +- .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 5 +- trunk/drivers/net/ethernet/emulex/benet/be.h | 2 +- .../net/ethernet/emulex/benet/be_cmds.c | 5 + .../net/ethernet/emulex/benet/be_main.c | 59 +- trunk/drivers/net/ethernet/freescale/Kconfig | 3 +- .../drivers/net/ethernet/ibm/ehea/ehea_phyp.h | 20 - trunk/drivers/net/ethernet/micrel/ksz884x.c | 12 +- .../net/ethernet/qlogic/qlcnic/qlcnic.h | 4 +- .../net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 5 +- .../net/ethernet/qlogic/qlcnic/qlcnic_hw.c | 5 +- .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 5 - .../ethernet/qlogic/qlcnic/qlcnic_minidump.c | 3 +- trunk/drivers/net/ethernet/realtek/8139cp.c | 18 +- trunk/drivers/net/ethernet/smsc/smc91x.c | 4 +- 
trunk/drivers/net/ethernet/smsc/smsc911x.c | 4 +- .../net/ethernet/stmicro/stmmac/stmmac.h | 6 +- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 22 +- trunk/drivers/net/ethernet/ti/cpts.c | 2 - trunk/drivers/net/tun.c | 87 +- trunk/drivers/net/usb/cdc_ether.c | 45 +- trunk/drivers/net/usb/cdc_ncm.c | 10 +- trunk/drivers/net/usb/qmi_wwan.c | 15 + trunk/drivers/net/usb/usbnet.c | 25 +- trunk/drivers/net/wimax/i2400m/i2400m-usb.h | 3 + trunk/drivers/net/wimax/i2400m/usb.c | 6 + trunk/drivers/net/wireless/Kconfig | 6 +- trunk/drivers/net/wireless/Makefile | 2 +- trunk/drivers/net/wireless/rt2x00/rt2x00dev.c | 8 + trunk/drivers/of/base.c | 144 +- trunk/drivers/of/fdt.c | 10 +- trunk/drivers/platform/x86/asus-nb-wmi.c | 2 +- trunk/drivers/platform/x86/asus-wmi.c | 2 +- trunk/drivers/platform/x86/eeepc-wmi.c | 2 +- trunk/drivers/power/charger-manager.c | 38 +- trunk/drivers/pwm/Kconfig | 39 +- trunk/drivers/pwm/Makefile | 5 +- trunk/drivers/pwm/core.c | 29 + trunk/drivers/pwm/pwm-imx.c | 2 +- trunk/drivers/pwm/pwm-lpc32xx.c | 23 +- trunk/drivers/pwm/pwm-samsung.c | 1 + trunk/drivers/pwm/pwm-spear.c | 276 ++ trunk/drivers/pwm/pwm-tiecap.c | 48 +- trunk/drivers/pwm/pwm-tiehrpwm.c | 62 +- trunk/drivers/pwm/pwm-tipwmss.c | 139 + trunk/drivers/pwm/pwm-tipwmss.h | 39 + trunk/drivers/pwm/pwm-twl-led.c | 344 ++ trunk/drivers/pwm/pwm-twl.c | 359 ++ trunk/drivers/pwm/pwm-twl6030.c | 184 - trunk/drivers/pwm/pwm-vt8500.c | 98 +- trunk/drivers/rtc/Kconfig | 31 +- trunk/drivers/rtc/Makefile | 2 + trunk/drivers/rtc/rtc-da9055.c | 413 ++ trunk/drivers/rtc/rtc-davinci.c | 21 +- trunk/drivers/rtc/rtc-dev.c | 19 - trunk/drivers/rtc/rtc-imxdi.c | 11 + trunk/drivers/rtc/rtc-omap.c | 80 +- trunk/drivers/rtc/rtc-pcf8523.c | 326 ++ trunk/drivers/rtc/rtc-s3c.c | 52 +- trunk/drivers/rtc/rtc-spear.c | 91 +- trunk/drivers/rtc/rtc-test.c | 14 +- trunk/drivers/rtc/rtc-tps65910.c | 9 +- trunk/drivers/rtc/rtc-vt8500.c | 15 +- trunk/drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- trunk/drivers/spi/spi-atmel.c | 17 +- trunk/drivers/spi/spi-s3c64xx.c | 10 +- trunk/drivers/spi/spi-sh-hspi.c | 2 +- trunk/drivers/spi/spi.c | 5 +- trunk/drivers/staging/android/binder.c | 3 +- trunk/drivers/usb/musb/musb_core.c | 12 +- trunk/drivers/usb/musb/musb_io.h | 21 - trunk/drivers/usb/musb/tusb6010.c | 5 +- trunk/drivers/video/Kconfig | 8 +- trunk/drivers/video/backlight/88pm860x_bl.c | 18 +- trunk/drivers/video/backlight/atmel-pwm-bl.c | 7 +- trunk/drivers/video/backlight/backlight.c | 29 + trunk/drivers/video/backlight/corgi_lcd.c | 20 +- trunk/drivers/video/backlight/da903x_bl.c | 15 +- trunk/drivers/video/backlight/da9052_bl.c | 2 +- trunk/drivers/video/backlight/generic_bl.c | 4 +- trunk/drivers/video/backlight/hp680_bl.c | 4 +- trunk/drivers/video/backlight/ili9320.c | 14 +- trunk/drivers/video/backlight/ili9320.h | 2 +- trunk/drivers/video/backlight/jornada720_bl.c | 31 +- trunk/drivers/video/backlight/l4f00242t03.c | 3 +- trunk/drivers/video/backlight/lcd.c | 8 +- trunk/drivers/video/backlight/lm3630_bl.c | 2 +- trunk/drivers/video/backlight/lm3639_bl.c | 2 +- trunk/drivers/video/backlight/lms283gf05.c | 17 +- trunk/drivers/video/backlight/locomolcd.c | 38 +- trunk/drivers/video/backlight/lp855x_bl.c | 51 +- trunk/drivers/video/backlight/max8925_bl.c | 11 +- trunk/drivers/video/backlight/omap1_bl.c | 4 +- trunk/drivers/video/backlight/pandora_bl.c | 8 +- .../video/backlight/pcf50633-backlight.c | 8 +- trunk/drivers/video/backlight/platform_lcd.c | 2 +- trunk/drivers/video/backlight/s6e63m0.c | 2 +- trunk/drivers/video/backlight/tdo24m.c | 33 +- 
trunk/drivers/video/backlight/tosa_bl.c | 7 +- trunk/drivers/video/backlight/tosa_lcd.c | 24 +- trunk/drivers/video/backlight/vgg2432a4.c | 10 +- trunk/drivers/video/gxt4500.c | 15 +- trunk/drivers/virt/fsl_hypervisor.c | 3 - trunk/fs/attr.c | 11 +- trunk/fs/autofs4/autofs_i.h | 8 +- trunk/fs/autofs4/dev-ioctl.c | 4 +- trunk/fs/autofs4/inode.c | 24 +- trunk/fs/autofs4/waitq.c | 5 +- trunk/fs/bad_inode.c | 2 +- trunk/fs/binfmt_elf.c | 4 +- trunk/fs/binfmt_em86.c | 1 - trunk/fs/binfmt_misc.c | 6 - trunk/fs/binfmt_script.c | 4 +- trunk/fs/block_dev.c | 4 +- trunk/fs/btrfs/Makefile | 2 +- trunk/fs/btrfs/acl.c | 2 + trunk/fs/btrfs/backref.c | 16 +- trunk/fs/btrfs/btrfs_inode.h | 4 + trunk/fs/btrfs/check-integrity.c | 31 +- trunk/fs/btrfs/compression.c | 6 +- trunk/fs/btrfs/ctree.c | 241 +- trunk/fs/btrfs/ctree.h | 182 +- trunk/fs/btrfs/delayed-inode.c | 11 +- trunk/fs/btrfs/dev-replace.c | 856 ++++ trunk/fs/btrfs/dev-replace.h | 44 + trunk/fs/btrfs/dir-item.c | 59 + trunk/fs/btrfs/disk-io.c | 142 +- trunk/fs/btrfs/disk-io.h | 4 +- trunk/fs/btrfs/extent-tree.c | 227 +- trunk/fs/btrfs/extent_io.c | 37 +- trunk/fs/btrfs/extent_io.h | 4 +- trunk/fs/btrfs/extent_map.c | 24 +- trunk/fs/btrfs/extent_map.h | 2 + trunk/fs/btrfs/file-item.c | 21 +- trunk/fs/btrfs/file.c | 422 +- trunk/fs/btrfs/free-space-cache.c | 51 +- trunk/fs/btrfs/inode-map.c | 5 +- trunk/fs/btrfs/inode.c | 484 +- trunk/fs/btrfs/ioctl.c | 317 +- trunk/fs/btrfs/ioctl.h | 48 +- trunk/fs/btrfs/math.h | 44 + trunk/fs/btrfs/ordered-data.c | 90 +- trunk/fs/btrfs/ordered-data.h | 7 +- trunk/fs/btrfs/print-tree.c | 3 + trunk/fs/btrfs/reada.c | 31 +- trunk/fs/btrfs/relocation.c | 40 +- trunk/fs/btrfs/root-tree.c | 4 +- trunk/fs/btrfs/scrub.c | 1836 +++++--- trunk/fs/btrfs/send.c | 8 +- trunk/fs/btrfs/super.c | 48 +- trunk/fs/btrfs/transaction.c | 170 +- trunk/fs/btrfs/transaction.h | 2 +- trunk/fs/btrfs/tree-log.c | 477 +- trunk/fs/btrfs/volumes.c | 966 +++- trunk/fs/btrfs/volumes.h | 35 +- trunk/fs/btrfs/xattr.c | 13 +- trunk/fs/ceph/dir.c | 4 +- trunk/fs/ceph/export.c | 4 +- trunk/fs/ceph/file.c | 6 +- trunk/fs/cifs/cifsfs.c | 8 +- trunk/fs/configfs/dir.c | 4 +- trunk/fs/eventfd.c | 20 + trunk/fs/eventpoll.c | 28 + trunk/fs/exec.c | 19 +- trunk/fs/exofs/inode.c | 16 +- trunk/fs/exportfs/expfs.c | 19 +- trunk/fs/ext3/dir.c | 6 +- trunk/fs/ext4/dir.c | 6 +- trunk/fs/ext4/file.c | 22 +- trunk/fs/fat/fat.h | 3 +- trunk/fs/fat/inode.c | 55 +- trunk/fs/fat/misc.c | 9 +- trunk/fs/fuse/dev.c | 4 +- trunk/fs/fuse/dir.c | 20 +- trunk/fs/fuse/file.c | 8 +- trunk/fs/fuse/fuse_i.h | 4 +- trunk/fs/fuse/inode.c | 23 +- trunk/fs/gfs2/file.c | 10 +- trunk/fs/hppfs/hppfs.c | 2 +- trunk/fs/jffs2/nodemgmt.c | 6 +- trunk/fs/libfs.c | 4 +- trunk/fs/lockd/clnt4xdr.c | 8 - trunk/fs/lockd/clntproc.c | 3 +- trunk/fs/lockd/clntxdr.c | 8 - trunk/fs/lockd/host.c | 15 +- trunk/fs/lockd/mon.c | 3 - trunk/fs/mount.h | 3 + trunk/fs/namespace.c | 212 +- trunk/fs/nfs/Makefile | 2 +- trunk/fs/nfs/blocklayout/blocklayout.c | 1 + trunk/fs/nfs/cache_lib.c | 1 - trunk/fs/nfs/callback.h | 4 +- trunk/fs/nfs/callback_proc.c | 17 +- trunk/fs/nfs/callback_xdr.c | 5 +- trunk/fs/nfs/client.c | 9 +- trunk/fs/nfs/dir.c | 28 +- trunk/fs/nfs/direct.c | 17 +- trunk/fs/nfs/file.c | 10 +- trunk/fs/nfs/inode.c | 10 +- trunk/fs/nfs/internal.h | 42 +- trunk/fs/nfs/mount_clnt.c | 7 +- trunk/fs/nfs/nfs2xdr.c | 4 +- trunk/fs/nfs/nfs3proc.c | 6 +- trunk/fs/nfs/nfs3xdr.c | 7 +- trunk/fs/nfs/nfs4_fs.h | 29 +- trunk/fs/nfs/nfs4client.c | 5 +- trunk/fs/nfs/nfs4file.c | 1 - trunk/fs/nfs/nfs4filelayout.c | 
45 +- trunk/fs/nfs/nfs4filelayoutdev.c | 3 +- trunk/fs/nfs/nfs4proc.c | 820 +--- trunk/fs/nfs/nfs4session.c | 552 +++ trunk/fs/nfs/nfs4session.h | 142 + trunk/fs/nfs/nfs4state.c | 143 +- trunk/fs/nfs/nfs4super.c | 1 + trunk/fs/nfs/nfs4xdr.c | 52 +- trunk/fs/nfs/objlayout/objlayout.c | 11 - trunk/fs/nfs/pnfs.c | 17 +- trunk/fs/nfs/proc.c | 43 - trunk/fs/nfs/super.c | 2 + trunk/fs/nfs/write.c | 31 +- trunk/fs/notify/Makefile | 2 +- trunk/fs/notify/fanotify/fanotify_user.c | 2 + trunk/fs/notify/fdinfo.c | 179 + trunk/fs/notify/fdinfo.h | 27 + trunk/fs/notify/inode_mark.c | 5 +- trunk/fs/notify/inotify/inotify_user.c | 2 + trunk/fs/ocfs2/extent_map.c | 12 +- trunk/fs/ocfs2/file.c | 6 +- trunk/fs/open.c | 2 +- trunk/fs/pnode.h | 1 + trunk/fs/proc/Makefile | 1 + trunk/fs/proc/array.c | 23 +- trunk/fs/proc/base.c | 169 +- trunk/fs/proc/fd.c | 2 + trunk/fs/proc/generic.c | 26 +- trunk/fs/proc/inode.c | 6 +- trunk/fs/proc/internal.h | 1 + trunk/fs/proc/namespaces.c | 185 +- trunk/fs/proc/proc_devtree.c | 6 +- trunk/fs/proc/root.c | 17 +- trunk/fs/proc/self.c | 59 + trunk/fs/proc/task_mmu.c | 53 + trunk/fs/pstore/inode.c | 6 +- trunk/fs/read_write.c | 40 +- trunk/fs/seq_file.c | 4 +- trunk/fs/signalfd.c | 18 + trunk/fs/sysfs/mount.c | 1 + trunk/fs/ubifs/debug.c | 8 +- trunk/fs/ubifs/dir.c | 4 +- trunk/include/asm-generic/io.h | 12 +- trunk/include/linux/asn1.h | 2 + trunk/include/linux/backing-dev.h | 4 - trunk/include/linux/backlight.h | 10 + trunk/include/linux/bcma/bcma.h | 1 + trunk/include/linux/binfmts.h | 2 - trunk/include/linux/blkdev.h | 19 +- trunk/include/linux/compat.h | 3 + trunk/include/linux/compiler-gcc4.h | 12 + trunk/include/linux/compiler-intel.h | 7 + trunk/include/linux/compiler.h | 11 + trunk/include/linux/cred.h | 2 - trunk/include/linux/dma-buf.h | 99 - trunk/include/linux/drbd.h | 81 +- trunk/include/linux/drbd_genl.h | 378 ++ trunk/include/linux/drbd_genl_api.h | 55 + trunk/include/linux/drbd_limits.h | 90 +- trunk/include/linux/drbd_nl.h | 163 - trunk/include/linux/drbd_tag_magic.h | 84 - trunk/include/linux/exportfs.h | 2 + trunk/include/linux/fs.h | 18 +- trunk/include/linux/ftrace.h | 4 +- trunk/include/linux/genhd.h | 8 +- trunk/include/linux/genl_magic_func.h | 422 ++ trunk/include/linux/genl_magic_struct.h | 277 ++ trunk/include/linux/gfp.h | 5 + trunk/include/linux/hugetlb_cgroup.h | 5 +- trunk/include/linux/i2c-omap.h | 2 - trunk/include/linux/i2c/i2c-sh_mobile.h | 1 + trunk/include/linux/idr.h | 11 + trunk/include/linux/ima.h | 6 + trunk/include/linux/init.h | 40 +- trunk/include/linux/input.h | 10 +- trunk/include/linux/input/bu21013.h | 10 +- trunk/include/linux/ipc_namespace.h | 9 +- trunk/include/linux/kernel.h | 33 + trunk/include/linux/loop.h | 3 + trunk/include/linux/lru_cache.h | 67 +- trunk/include/linux/memcontrol.h | 209 + trunk/include/linux/mm_types.h | 7 +- trunk/include/linux/mnt_namespace.h | 3 +- trunk/include/linux/moduleparam.h | 6 +- trunk/include/linux/mtd/blktrans.h | 4 +- trunk/include/linux/mtd/doc2000.h | 22 +- trunk/include/linux/mtd/fsmc.h | 3 - trunk/include/linux/mtd/gpmi-nand.h | 68 - trunk/include/linux/mtd/map.h | 4 +- trunk/include/linux/mtd/mtd.h | 2 +- trunk/include/linux/mtd/nand.h | 11 +- trunk/include/linux/mtd/sh_flctl.h | 14 +- trunk/include/linux/nfs_fs_sb.h | 47 - trunk/include/linux/nfs_xdr.h | 155 +- trunk/include/linux/nsproxy.h | 2 +- trunk/include/linux/of.h | 29 +- trunk/include/linux/of_platform.h | 1 + trunk/include/linux/percpu-rwsem.h | 91 +- trunk/include/linux/pid_namespace.h | 11 +- 
.../linux/platform_data/i2c-cbus-gpio.h | 27 + trunk/include/linux/platform_data/lp855x.h | 9 +- .../linux/platform_data/mtd-nomadik-nand.h | 16 - trunk/include/linux/proc_fs.h | 29 +- trunk/include/linux/ptrace.h | 2 + trunk/include/linux/pwm.h | 3 + trunk/include/linux/raid/pq.h | 4 + trunk/include/linux/random.h | 19 +- trunk/include/linux/res_counter.h | 12 +- trunk/include/linux/sched.h | 7 +- trunk/include/linux/security.h | 13 + trunk/include/linux/slab.h | 57 +- trunk/include/linux/slab_def.h | 9 +- trunk/include/linux/slub_def.h | 9 +- trunk/include/linux/string.h | 11 + trunk/include/linux/sunrpc/sched.h | 1 - trunk/include/linux/syscalls.h | 5 +- trunk/include/linux/thread_info.h | 2 + trunk/include/linux/usb/usbnet.h | 3 + trunk/include/linux/user_namespace.h | 10 + trunk/include/linux/utsname.h | 7 +- trunk/include/linux/wait.h | 164 + trunk/include/net/inet_connection_sock.h | 1 + trunk/include/net/ndisc.h | 7 + trunk/include/net/net_namespace.h | 2 + trunk/include/trace/events/btrfs.h | 3 +- trunk/include/trace/events/gfpflags.h | 1 + trunk/include/uapi/asm-generic/unistd.h | 4 +- trunk/include/uapi/linux/if_bridge.h | 3 + trunk/include/uapi/linux/module.h | 8 + trunk/include/uapi/linux/ptrace.h | 5 +- trunk/include/uapi/linux/swab.h | 12 +- trunk/include/xen/interface/event_channel.h | 13 + trunk/init/Kconfig | 4 +- trunk/init/do_mounts.c | 61 +- trunk/init/main.c | 1 - trunk/init/version.c | 2 + trunk/ipc/msgutil.c | 2 + trunk/ipc/namespace.c | 33 +- trunk/kernel/Makefile | 10 +- trunk/kernel/cgroup.c | 3 +- trunk/kernel/compat.c | 17 + trunk/kernel/cred.c | 27 +- trunk/kernel/events/core.c | 2 +- trunk/kernel/exit.c | 12 - trunk/kernel/fork.c | 73 +- trunk/kernel/irq/manage.c | 2 +- trunk/kernel/modsign_certificate.S | 19 + trunk/kernel/modsign_pubkey.c | 6 - trunk/kernel/module.c | 444 +- trunk/kernel/nsproxy.c | 36 +- trunk/kernel/pid.c | 62 +- trunk/kernel/pid_namespace.c | 113 +- trunk/kernel/posix-cpu-timers.c | 3 + trunk/kernel/printk.c | 40 +- trunk/kernel/ptrace.c | 13 +- trunk/kernel/res_counter.c | 20 +- trunk/kernel/sched/core.c | 10 +- trunk/kernel/sched/fair.c | 5 +- trunk/kernel/signal.c | 2 +- trunk/kernel/sys_ni.c | 1 + trunk/kernel/sysctl_binary.c | 2 +- trunk/kernel/trace/ftrace.c | 4 +- trunk/kernel/trace/trace.c | 60 +- trunk/kernel/trace/trace_stack.c | 4 - trunk/kernel/trace/trace_uprobe.c | 8 +- trunk/kernel/user.c | 2 + trunk/kernel/user_namespace.c | 147 +- trunk/kernel/utsname.c | 34 +- trunk/kernel/watchdog.c | 24 +- trunk/lib/Kconfig | 3 + trunk/lib/Kconfig.debug | 10 +- trunk/lib/Makefile | 5 +- trunk/lib/asn1_decoder.c | 8 +- trunk/lib/dynamic_debug.c | 9 +- trunk/lib/interval_tree_test_main.c | 7 +- trunk/lib/kstrtox.c | 64 + trunk/lib/lru_cache.c | 359 +- ....c => of-reconfig-notifier-error-inject.c} | 22 +- trunk/lib/percpu-rwsem.c | 165 + trunk/lib/raid6/Makefile | 9 +- trunk/lib/raid6/algos.c | 12 + trunk/lib/raid6/altivec.uc | 3 - trunk/lib/raid6/avx2.c | 251 ++ trunk/lib/raid6/mmx.c | 2 +- trunk/lib/raid6/recov_avx2.c | 323 ++ trunk/lib/raid6/recov_ssse3.c | 4 - trunk/lib/raid6/sse1.c | 2 +- trunk/lib/raid6/sse2.c | 8 +- trunk/lib/raid6/test/Makefile | 29 +- trunk/lib/raid6/x86.h | 14 +- trunk/lib/random32.c | 97 +- trunk/lib/rbtree_test.c | 8 +- trunk/lib/scatterlist.c | 3 +- trunk/lib/vsprintf.c | 109 +- trunk/mm/Kconfig | 13 +- trunk/mm/backing-dev.c | 84 - trunk/mm/hugetlb.c | 11 +- trunk/mm/hugetlb_cgroup.c | 19 +- trunk/mm/kmemleak.c | 3 +- trunk/mm/ksm.c | 16 +- trunk/mm/memcontrol.c | 1524 ++++++- trunk/mm/memory.c | 9 +- 
trunk/mm/memory_hotplug.c | 18 +- trunk/mm/migrate.c | 2 +- trunk/mm/mprotect.c | 30 +- trunk/mm/page_alloc.c | 38 +- trunk/mm/shmem.c | 20 +- trunk/mm/slab.c | 383 +- trunk/mm/slab.h | 190 +- trunk/mm/slab_common.c | 292 +- trunk/mm/slob.c | 48 +- trunk/mm/slub.c | 447 +- trunk/mm/vmscan.c | 26 +- trunk/net/atm/atm_sysfs.c | 40 +- trunk/net/bridge/br_mdb.c | 22 +- trunk/net/bridge/br_multicast.c | 13 +- trunk/net/bridge/br_netlink.c | 1 - trunk/net/bridge/br_private.h | 5 +- trunk/net/core/net_namespace.c | 32 +- trunk/net/dccp/ipv4.c | 4 +- trunk/net/dccp/ipv6.c | 3 +- trunk/net/ipv4/inet_connection_sock.c | 16 + trunk/net/ipv4/tcp_ipv4.c | 6 +- trunk/net/ipv6/Makefile | 2 +- trunk/net/ipv6/addrconf.c | 3 +- trunk/net/ipv6/ndisc.c | 17 + trunk/net/ipv6/tcp_ipv6.c | 3 +- trunk/net/mac802154/ieee802154_dev.c | 4 +- trunk/net/netlink/af_netlink.c | 5 +- trunk/net/sctp/Kconfig | 27 +- trunk/net/sctp/probe.c | 3 +- trunk/net/sctp/protocol.c | 4 +- trunk/net/sunrpc/auth_gss/auth_gss.c | 17 +- trunk/net/sunrpc/backchannel_rqst.c | 9 +- trunk/net/sunrpc/bc_svc.c | 2 +- trunk/net/sunrpc/cache.c | 4 +- trunk/net/sunrpc/clnt.c | 48 +- trunk/net/sunrpc/rpc_pipe.c | 9 +- trunk/net/sunrpc/rpcb_clnt.c | 5 +- trunk/net/sunrpc/sched.c | 71 +- trunk/net/sunrpc/svc.c | 12 +- trunk/net/sunrpc/svc_xprt.c | 11 +- trunk/net/sunrpc/svcsock.c | 6 +- trunk/net/sunrpc/xdr.c | 5 +- trunk/net/sunrpc/xprtsock.c | 19 +- trunk/scripts/Makefile.modsign | 32 + trunk/scripts/checkpatch.pl | 143 +- .../scripts/coccinelle/api/d_find_alias.cocci | 80 + trunk/security/capability.c | 6 + trunk/security/commoncap.c | 25 +- trunk/security/integrity/ima/ima.h | 2 +- trunk/security/integrity/ima/ima_api.c | 4 +- trunk/security/integrity/ima/ima_main.c | 21 + trunk/security/integrity/ima/ima_policy.c | 3 + trunk/security/security.c | 10 + trunk/security/selinux/nlmsgtab.c | 2 + trunk/security/yama/yama_lsm.c | 12 +- trunk/sound/Kconfig | 3 - trunk/sound/sound_core.c | 3 +- trunk/tools/power/x86/turbostat/Makefile | 21 +- trunk/tools/power/x86/turbostat/turbostat.8 | 103 +- trunk/tools/power/x86/turbostat/turbostat.c | 677 ++- .../power/x86/x86_energy_perf_policy/Makefile | 6 +- .../x86_energy_perf_policy.c | 2 +- .../testing/selftests/breakpoints/Makefile | 2 +- .../testing/selftests/cpu-hotplug/Makefile | 2 +- trunk/tools/testing/selftests/kcmp/Makefile | 6 +- .../tools/testing/selftests/kcmp/kcmp_test.c | 6 +- .../testing/selftests/memory-hotplug/Makefile | 2 +- trunk/tools/testing/selftests/mqueue/Makefile | 4 +- trunk/tools/testing/selftests/vm/Makefile | 2 +- 1224 files changed, 44953 insertions(+), 23367 deletions(-) create mode 100644 trunk/Documentation/devicetree/bindings/i2c/i2c-cbus-gpio.txt create mode 100644 trunk/Documentation/devicetree/bindings/i2c/i2c-mux-gpio.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/gpio-matrix-keypad.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/pwm-beeper.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/stmpe-keypad.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/tca8418_keypad.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/touchscreen/mms114.txt create mode 100644 trunk/Documentation/devicetree/bindings/input/touchscreen/stmpe.txt create mode 100644 trunk/Documentation/devicetree/bindings/mtd/denali-nand.txt create mode 100644 trunk/Documentation/devicetree/bindings/mtd/flctl-nand.txt create mode 100644 trunk/Documentation/devicetree/bindings/mtd/m25p80.txt create mode 100644 
trunk/Documentation/devicetree/bindings/powerpc/fsl/raideng.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/pwm-tiehrpwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/pwm-tipwmss.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/spear-pwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt create mode 100644 trunk/Documentation/devicetree/bindings/pwm/vt8500-pwm.txt create mode 100644 trunk/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt create mode 100644 trunk/Documentation/devicetree/bindings/rtc/rtc-omap.txt create mode 100644 trunk/Documentation/devicetree/bindings/spi/spi_atmel.txt create mode 100644 trunk/Documentation/xtensa/atomctl.txt create mode 100644 trunk/arch/alpha/include/uapi/asm/a.out.h rename trunk/arch/alpha/include/{ => uapi}/asm/auxvec.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/bitsperlong.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/byteorder.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/compiler.h create mode 100644 trunk/arch/alpha/include/uapi/asm/console.h rename trunk/arch/alpha/include/{ => uapi}/asm/errno.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/fcntl.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/fpu.h rename trunk/arch/alpha/include/{ => uapi}/asm/gentrap.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/ioctl.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/ioctls.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/ipcbuf.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/kvm_para.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/mman.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/msgbuf.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/pal.h create mode 100644 trunk/arch/alpha/include/uapi/asm/param.h rename trunk/arch/alpha/include/{ => uapi}/asm/poll.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/posix_types.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/ptrace.h rename trunk/arch/alpha/include/{ => uapi}/asm/reg.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/regdef.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/resource.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/sembuf.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/setup.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/shmbuf.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/sigcontext.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/siginfo.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/signal.h create mode 100644 trunk/arch/alpha/include/uapi/asm/socket.h rename trunk/arch/alpha/include/{ => uapi}/asm/sockios.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/stat.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/statfs.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/swab.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/sysinfo.h (100%) rename trunk/arch/alpha/include/{ => uapi}/asm/termbits.h (100%) create mode 100644 trunk/arch/alpha/include/uapi/asm/termios.h create mode 100644 trunk/arch/alpha/include/uapi/asm/types.h create mode 100644 trunk/arch/alpha/include/uapi/asm/unistd.h delete mode 100644 trunk/arch/arm/mach-nomadik/include/mach/fsmc.h delete mode 100644 trunk/arch/blackfin/include/asm/kvm_para.h create 
mode 100644 trunk/arch/blackfin/include/uapi/asm/bfin_sport.h rename trunk/arch/blackfin/include/{ => uapi}/asm/byteorder.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/cachectl.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/fcntl.h (100%) create mode 100644 trunk/arch/blackfin/include/uapi/asm/fixed_code.h rename trunk/arch/blackfin/include/{ => uapi}/asm/ioctls.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/poll.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/posix_types.h (100%) create mode 100644 trunk/arch/blackfin/include/uapi/asm/ptrace.h rename trunk/arch/blackfin/include/{ => uapi}/asm/sigcontext.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/siginfo.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/signal.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/stat.h (100%) rename trunk/arch/blackfin/include/{ => uapi}/asm/swab.h (100%) create mode 100644 trunk/arch/blackfin/include/uapi/asm/unistd.h delete mode 100644 trunk/arch/openrisc/include/uapi/asm/kvm_para.h create mode 100644 trunk/arch/powerpc/boot/dts/a3m071.dts create mode 100644 trunk/arch/powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi delete mode 100644 trunk/arch/powerpc/include/asm/pSeries_reconfig.h create mode 100644 trunk/arch/powerpc/include/asm/setup.h rename trunk/arch/powerpc/kernel/{cpu_setup_power7.S => cpu_setup_power.S} (80%) delete mode 100644 trunk/arch/powerpc/xmon/start.c create mode 100644 trunk/arch/s390/include/asm/pci_debug.h create mode 100644 trunk/arch/s390/pci/pci_debug.c create mode 100644 trunk/arch/xtensa/boot/boot-uboot/Makefile create mode 100644 trunk/arch/xtensa/boot/dts/lx60.dts create mode 100644 trunk/arch/xtensa/boot/dts/ml605.dts create mode 100644 trunk/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi create mode 100644 trunk/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi create mode 100644 trunk/arch/xtensa/boot/dts/xtfpga.dtsi create mode 100644 trunk/arch/xtensa/include/asm/initialize_mmu.h create mode 100644 trunk/arch/xtensa/include/asm/prom.h create mode 100644 trunk/arch/xtensa/include/asm/traps.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/Makefile create mode 100644 trunk/arch/xtensa/platforms/xtfpga/include/platform/hardware.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/include/platform/lcd.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/include/platform/serial.h create mode 100644 trunk/arch/xtensa/platforms/xtfpga/lcd.c create mode 100644 trunk/arch/xtensa/platforms/xtfpga/setup.c create mode 100644 trunk/drivers/block/drbd/drbd_interval.c create mode 100644 trunk/drivers/block/drbd/drbd_interval.h create mode 100644 trunk/drivers/block/drbd/drbd_nla.c create mode 100644 trunk/drivers/block/drbd/drbd_nla.h create mode 100644 trunk/drivers/block/drbd/drbd_state.c create mode 100644 trunk/drivers/block/drbd/drbd_state.h create mode 100644 trunk/drivers/i2c/busses/i2c-cbus-gpio.c create mode 100644 trunk/drivers/input/misc/da9055_onkey.c create mode 100644 trunk/drivers/input/misc/retu-pwrbutton.c create mode 100644 trunk/drivers/input/serio/arc_ps2.c delete mode 100644 trunk/drivers/input/touchscreen/h3600_ts_input.c delete mode 100644 trunk/drivers/mtd/maps/fortunet.c create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/Makefile create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/main.c create mode 100644 trunk/drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c create mode 100644 trunk/drivers/mtd/nand/denali_dt.c 
create mode 100644 trunk/drivers/mtd/nand/denali_pci.c delete mode 100644 trunk/drivers/mtd/nand/nomadik_nand.c create mode 100644 trunk/drivers/pwm/pwm-spear.c create mode 100644 trunk/drivers/pwm/pwm-tipwmss.c create mode 100644 trunk/drivers/pwm/pwm-tipwmss.h create mode 100644 trunk/drivers/pwm/pwm-twl-led.c create mode 100644 trunk/drivers/pwm/pwm-twl.c delete mode 100644 trunk/drivers/pwm/pwm-twl6030.c create mode 100644 trunk/drivers/rtc/rtc-da9055.c create mode 100644 trunk/drivers/rtc/rtc-pcf8523.c create mode 100644 trunk/fs/btrfs/dev-replace.c create mode 100644 trunk/fs/btrfs/dev-replace.h create mode 100644 trunk/fs/btrfs/math.h create mode 100644 trunk/fs/nfs/nfs4session.c create mode 100644 trunk/fs/nfs/nfs4session.h create mode 100644 trunk/fs/notify/fdinfo.c create mode 100644 trunk/fs/notify/fdinfo.h create mode 100644 trunk/fs/proc/self.c create mode 100644 trunk/include/linux/drbd_genl.h create mode 100644 trunk/include/linux/drbd_genl_api.h delete mode 100644 trunk/include/linux/drbd_nl.h delete mode 100644 trunk/include/linux/drbd_tag_magic.h create mode 100644 trunk/include/linux/genl_magic_func.h create mode 100644 trunk/include/linux/genl_magic_struct.h delete mode 100644 trunk/include/linux/mtd/gpmi-nand.h create mode 100644 trunk/include/linux/platform_data/i2c-cbus-gpio.h delete mode 100644 trunk/include/linux/platform_data/mtd-nomadik-nand.h create mode 100644 trunk/include/uapi/linux/module.h create mode 100644 trunk/kernel/modsign_certificate.S rename trunk/lib/{pSeries-reconfig-notifier-error-inject.c => of-reconfig-notifier-error-inject.c} (51%) create mode 100644 trunk/lib/percpu-rwsem.c create mode 100644 trunk/lib/raid6/avx2.c create mode 100644 trunk/lib/raid6/recov_avx2.c create mode 100644 trunk/scripts/Makefile.modsign create mode 100644 trunk/scripts/coccinelle/api/d_find_alias.cocci diff --git a/[refs] b/[refs] index 14d1d1dd3afd..3eabd3bcdc9b 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 4d1839138220e7e35bf9e31c854e4e0196dea7a1 +refs/heads/master: 1ffab3d4139533eff6e27b7568825307e575faa6 diff --git a/trunk/.gitignore b/trunk/.gitignore index 92bd0e45dfa1..3b8b9b33be38 100644 --- a/trunk/.gitignore +++ b/trunk/.gitignore @@ -60,7 +60,6 @@ modules.builtin # Generated include files # include/config -include/linux/version.h include/generated arch/*/include/generated diff --git a/trunk/Documentation/00-INDEX b/trunk/Documentation/00-INDEX index ceb1ff735469..8afe64fb2009 100644 --- a/trunk/Documentation/00-INDEX +++ b/trunk/Documentation/00-INDEX @@ -136,8 +136,6 @@ fault-injection/ - dir with docs about the fault injection capabilities infrastructure. fb/ - directory with info on the frame buffer graphics abstraction layer. -feature-removal-schedule.txt - - list of files and features that are going to be removed. filesystems/ - info on the vfs and the various filesystems that Linux supports. firmware_class/ diff --git a/trunk/Documentation/ABI/README b/trunk/Documentation/ABI/README index 9feaf16f1617..10069828568b 100644 --- a/trunk/Documentation/ABI/README +++ b/trunk/Documentation/ABI/README @@ -36,9 +36,6 @@ The different levels of stability are: the kernel, but are marked to be removed at some later point in time. The description of the interface will document the reason why it is obsolete and when it can be expected to be removed. - The file Documentation/feature-removal-schedule.txt may describe - some of these interfaces, giving a schedule for when they will - be removed. 
removed/ This directory contains a list of the old interfaces that have diff --git a/trunk/Documentation/ABI/stable/sysfs-devices-node b/trunk/Documentation/ABI/stable/sysfs-devices-node index 49b82cad7003..ce259c13c36a 100644 --- a/trunk/Documentation/ABI/stable/sysfs-devices-node +++ b/trunk/Documentation/ABI/stable/sysfs-devices-node @@ -1,7 +1,101 @@ +What: /sys/devices/system/node/possible +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that could be possibly become online at some point. + +What: /sys/devices/system/node/online +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that are online. + +What: /sys/devices/system/node/has_normal_memory +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have regular memory. + +What: /sys/devices/system/node/has_cpu +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have one or more CPUs. + +What: /sys/devices/system/node/has_high_memory +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have regular or high memory. + Depends on CONFIG_HIGHMEM. + What: /sys/devices/system/node/nodeX Date: October 2002 Contact: Linux Memory Management list Description: When CONFIG_NUMA is enabled, this is a directory containing information on node X such as what CPUs are local to the - node. + node. Each file is detailed next. + +What: /sys/devices/system/node/nodeX/cpumap +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's cpumap. + +What: /sys/devices/system/node/nodeX/cpulist +Date: October 2002 +Contact: Linux Memory Management list +Description: + The CPUs associated to the node. + +What: /sys/devices/system/node/nodeX/meminfo +Date: October 2002 +Contact: Linux Memory Management list +Description: + Provides information about the node's distribution and memory + utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt + +What: /sys/devices/system/node/nodeX/numastat +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's hit/miss statistics, in units of pages. + See Documentation/numastat.txt + +What: /sys/devices/system/node/nodeX/distance +Date: October 2002 +Contact: Linux Memory Management list +Description: + Distance between the node and all the other nodes + in the system. + +What: /sys/devices/system/node/nodeX/vmstat +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's zoned virtual memory statistics. + This is a superset of numastat. + +What: /sys/devices/system/node/nodeX/compact +Date: February 2010 +Contact: Mel Gorman +Description: + When this file is written to, all memory within that node + will be compacted. When it completes, memory will be freed + into blocks which have as many contiguous pages as possible + +What: /sys/devices/system/node/nodeX/scan_unevictable_pages +Date: October 2008 +Contact: Lee Schermerhorn +Description: + When set, it triggers scanning the node's unevictable lists + and move any pages that have become evictable onto the respective + zone's inactive list. See mm/vmscan.c + +What: /sys/devices/system/node/nodeX/hugepages/hugepages-/ +Date: December 2009 +Contact: Lee Schermerhorn +Description: + The node's huge page size control/query attributes. 
+ See Documentation/vm/hugetlbpage.txt \ No newline at end of file diff --git a/trunk/Documentation/ABI/testing/ima_policy b/trunk/Documentation/ABI/testing/ima_policy index 986946613542..ec0a38ef3145 100644 --- a/trunk/Documentation/ABI/testing/ima_policy +++ b/trunk/Documentation/ABI/testing/ima_policy @@ -23,7 +23,7 @@ Description: lsm: [[subj_user=] [subj_role=] [subj_type=] [obj_user=] [obj_role=] [obj_type=]] - base: func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK] + base: func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK][MODULE_CHECK] mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC] fsmagic:= hex value uid:= decimal value @@ -53,6 +53,7 @@ Description: measure func=BPRM_CHECK measure func=FILE_MMAP mask=MAY_EXEC measure func=FILE_CHECK mask=MAY_READ uid=0 + measure func=MODULE_CHECK uid=0 appraise fowner=0 The default policy measures all executables in bprm_check, diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl index 00687ee9d363..f75ab4c1b281 100644 --- a/trunk/Documentation/DocBook/kernel-api.tmpl +++ b/trunk/Documentation/DocBook/kernel-api.tmpl @@ -58,6 +58,9 @@ String Conversions !Elib/vsprintf.c +!Finclude/linux/kernel.h kstrtol +!Finclude/linux/kernel.h kstrtoul +!Elib/kstrtox.c String Manipulation %s ino %lu pgbase %u req %Zu@%llu\n", __func__, hdr->inode->i_ino, @@ -538,9 +531,8 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, &filelayout_read_call_ops, RPC_TASK_SOFTCONN); - BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -554,7 +546,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; - int status; /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); @@ -579,10 +570,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); - BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -909,7 +899,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { /* @@ -939,7 +929,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) goto out_mds; @@ -1187,7 +1177,6 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, */ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - BUG_ON(!list_empty(&b->written)); pnfs_put_lseg(b->wlseg); b->wlseg = NULL; } diff --git a/trunk/fs/nfs/nfs4filelayoutdev.c b/trunk/fs/nfs/nfs4filelayoutdev.c index a8eaa9b7bb0f..b720064bcd7f 100644 --- a/trunk/fs/nfs/nfs4filelayoutdev.c +++ b/trunk/fs/nfs/nfs4filelayoutdev.c @@ -33,6 +33,7 @@ #include #include "internal.h" +#include "nfs4session.h" #include "nfs4filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -162,8 +163,6 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct 
nfs4_pnfs_ds *ds) dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - BUG_ON(list_empty(&ds->ds_addrs)); - list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); diff --git a/trunk/fs/nfs/nfs4proc.c b/trunk/fs/nfs/nfs4proc.c index 5eec4429970c..493f0f41c554 100644 --- a/trunk/fs/nfs/nfs4proc.c +++ b/trunk/fs/nfs/nfs4proc.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -64,14 +63,14 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4session.h" + #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) -#define NFS4_MAX_LOOP_ON_RECOVER (10) - struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -206,7 +205,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent { __be32 *start, *p; - BUG_ON(readdir->count < 80); if (cookie > 2) { readdir->cookie = cookie; memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); @@ -256,22 +254,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } -static int nfs4_wait_clnt_recover(struct nfs_client *clp) -{ - int res; - - might_sleep(); - - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (res) - return res; - - if (clp->cl_cons_state < 0) - return clp->cl_cons_state; - return 0; -} - static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) { int res = 0; @@ -351,7 +333,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: ret = nfs4_delay(server->client, &exception->timeout); if (ret != 0) break; @@ -397,144 +378,136 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp #if defined(CONFIG_NFS_V4_1) -/* - * nfs4_free_slot - free a slot and efficiently update slot table. - * - * freeing a slot is trivially done by clearing its respective bit - * in the bitmap. - * If the freed slotid equals highest_used_slotid we want to update it - * so that the server would be able to size down the slot table if needed, - * otherwise we know that the highest_used_slotid is still in use. - * When updating highest_used_slotid there may be "holes" in the bitmap - * so we need to scan down from highest_used_slotid to 0 looking for the now - * highest slotid in use. - * If none found, highest_used_slotid is set to NFS4_NO_SLOT. 
- * - * Must be called while holding tbl->slot_tbl_lock - */ -static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid) -{ - BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE); - /* clear used bit in bitmap */ - __clear_bit(slotid, tbl->used_slots); - - /* update highest_used_slotid when it is freed */ - if (slotid == tbl->highest_used_slotid) { - slotid = find_last_bit(tbl->used_slots, tbl->max_slots); - if (slotid < tbl->max_slots) - tbl->highest_used_slotid = slotid; - else - tbl->highest_used_slotid = NFS4_NO_SLOT; - } - dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, - slotid, tbl->highest_used_slotid); -} - -bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - return true; -} - -/* - * Signal state manager thread if session fore channel is drained - */ -static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) -{ - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { - rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq, - nfs4_set_task_privileged, NULL); - return; - } - - if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT) - return; - - dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); - complete(&ses->fc_slot_table.complete); -} - -/* - * Signal state manager thread if session back channel is drained - */ -void nfs4_check_drain_bc_complete(struct nfs4_session *ses) -{ - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || - ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT) - return; - dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); - complete(&ses->bc_slot_table.complete); -} - static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { + struct nfs4_session *session; struct nfs4_slot_table *tbl; + bool send_new_highest_used_slotid = false; - tbl = &res->sr_session->fc_slot_table; if (!res->sr_slot) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ dprintk("%s: No slot\n", __func__); return; } + tbl = res->sr_slot->table; + session = tbl->session; spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slot - tbl->slots); - nfs4_check_drain_fc_complete(res->sr_session); + /* Be nice to the server: try to ensure that the last transmitted + * value for highest_user_slotid <= target_highest_slotid + */ + if (tbl->highest_used_slotid > tbl->target_highest_slotid) + send_new_highest_used_slotid = true; + + if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { + send_new_highest_used_slotid = false; + goto out_unlock; + } + nfs4_free_slot(tbl, res->sr_slot); + + if (tbl->highest_used_slotid != NFS4_NO_SLOT) + send_new_highest_used_slotid = false; +out_unlock: spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; + if (send_new_highest_used_slotid) + nfs41_server_notify_highest_slotid_update(session->clp); } static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { - unsigned long timestamp; + struct nfs4_session *session; + struct nfs4_slot *slot; struct nfs_client *clp; - - /* - * sr_status remains 1 if an RPC level error occurred. The server - * may or may not have processed the sequence operation.. - * Proceed as if the server received and processed the sequence - * operation. 
- */ - if (res->sr_status == 1) - res->sr_status = NFS_OK; + bool interrupted = false; + int ret = 1; /* don't increment the sequence number if the task wasn't sent */ if (!RPC_WAS_SENT(task)) goto out; + slot = res->sr_slot; + session = slot->table->session; + + if (slot->interrupted) { + slot->interrupted = 0; + interrupted = true; + } + /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++res->sr_slot->seq_nr; - timestamp = res->sr_renewal_time; - clp = res->sr_session->clp; - do_renew_lease(clp, timestamp); + ++slot->seq_nr; + clp = session->clp; + do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ if (res->sr_status_flags != 0) nfs4_schedule_lease_recovery(clp); + nfs41_update_target_slotid(slot->table, slot, res); break; + case 1: + /* + * sr_status remains 1 if an RPC level error occurred. + * The server may or may not have processed the sequence + * operation.. + * Mark the slot as having hosted an interrupted RPC call. + */ + slot->interrupted = 1; + goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and * returned NFS4ERR_DELAY as per Section 2.10.6.2 * of RFC5661. */ - dprintk("%s: slot=%td seq=%d: Operation in progress\n", + dprintk("%s: slot=%u seq=%u: Operation in progress\n", __func__, - res->sr_slot - res->sr_session->fc_slot_table.slots, - res->sr_slot->seq_nr); + slot->slot_nr, + slot->seq_nr); goto out_retry; + case -NFS4ERR_BADSLOT: + /* + * The slot id we used was probably retired. Try again + * using a different slot id. + */ + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + /* + * Was the last operation on this sequence interrupted? + * If so, retry after bumping the sequence number. + */ + if (interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } + /* + * Could this slot have been previously retired? + * If so, then the server may be expecting seq_nr = 1! + */ + if (slot->seq_nr != 1) { + slot->seq_nr = 1; + goto retry_nowait; + } + break; + case -NFS4ERR_SEQ_FALSE_RETRY: + ++slot->seq_nr; + goto retry_nowait; default: /* Just update the slot sequence no. */ - ++res->sr_slot->seq_nr; + ++slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); - return 1; + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) { + task->tk_status = 0; + ret = 0; + } + goto out; out_retry: if (!rpc_restart_call(task)) goto out; @@ -545,55 +518,27 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { - if (res->sr_session == NULL) + if (res->sr_slot == NULL) return 1; return nfs41_sequence_done(task, res); } -/* - * nfs4_find_slot - efficiently look for a free slot - * - * nfs4_find_slot looks for an unset bit in the used_slots bitmap. - * If found, we mark the slot as used, update the highest_used_slotid, - * and respectively set up the sequence operation args. - * The slot number is returned if found, or NFS4_NO_SLOT otherwise. - * - * Note: must be called with under the slot_tbl_lock. 
- */ -static u32 -nfs4_find_slot(struct nfs4_slot_table *tbl) -{ - u32 slotid; - u32 ret_id = NFS4_NO_SLOT; - - dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, - tbl->max_slots); - slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); - if (slotid >= tbl->max_slots) - goto out; - __set_bit(slotid, tbl->used_slots); - if (slotid > tbl->highest_used_slotid || - tbl->highest_used_slotid == NFS4_NO_SLOT) - tbl->highest_used_slotid = slotid; - ret_id = slotid; -out: - dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); - return ret_id; -} - static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - args->sa_session = NULL; + args->sa_slot = NULL; args->sa_cache_this = 0; + args->sa_privileged = 0; if (cache_reply) args->sa_cache_this = 1; - res->sr_session = NULL; res->sr_slot = NULL; } +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ + args->sa_privileged = 1; +} + int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -601,59 +546,59 @@ int nfs41_setup_sequence(struct nfs4_session *session, { struct nfs4_slot *slot; struct nfs4_slot_table *tbl; - u32 slotid; dprintk("--> %s\n", __func__); /* slot already allocated? */ if (res->sr_slot != NULL) - return 0; + goto out_success; tbl = &session->fc_slot_table; + task->tk_timeout = 0; + spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && - !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); dprintk("%s session is draining\n", __func__); - return -EAGAIN; + goto out_sleep; } - if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && - !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s enforce FIFO order\n", __func__); - return -EAGAIN; - } - - slotid = nfs4_find_slot(tbl); - if (slotid == NFS4_NO_SLOT) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* If out of memory, try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; dprintk("<-- %s: no free slots\n", __func__); - return -EAGAIN; + goto out_sleep; } spin_unlock(&tbl->slot_tbl_lock); - rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); - slot = tbl->slots + slotid; - args->sa_session = session; - args->sa_slotid = slotid; + args->sa_slot = slot; - dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + dprintk("<-- %s slotid=%d seqid=%d\n", __func__, + slot->slot_nr, slot->seq_nr); - res->sr_session = session; res->sr_slot = slot; - res->sr_renewal_time = jiffies; + res->sr_timestamp = jiffies; res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain * set to 1 if an rpc level failure occurs. 
*/ res->sr_status = 1; +out_success: + rpc_call_start(task); return 0; +out_sleep: + /* Privileged tasks are queued with top priority */ + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; } EXPORT_SYMBOL_GPL(nfs41_setup_sequence); @@ -665,12 +610,14 @@ int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_session *session = nfs4_get_session(server); int ret = 0; - if (session == NULL) + if (session == NULL) { + rpc_call_start(task); goto out; + } - dprintk("--> %s clp %p session %p sr_slot %td\n", + dprintk("--> %s clp %p session %p sr_slot %d\n", __func__, session->clp, session, res->sr_slot ? - res->sr_slot - session->fc_slot_table.slots : -1); + res->sr_slot->slot_nr : -1); ret = nfs41_setup_sequence(session, args, res, task); out: @@ -687,19 +634,11 @@ struct nfs41_call_sync_data { static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; + struct nfs4_session *session = nfs4_get_session(data->seq_server); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - if (nfs4_setup_sequence(data->seq_server, data->seq_args, - data->seq_res, task)) - return; - rpc_call_start(task); -} - -static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs41_call_sync_prepare(task, calldata); + nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); } static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) @@ -714,17 +653,11 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static const struct rpc_call_ops nfs41_call_priv_sync_ops = { - .rpc_call_prepare = nfs41_call_priv_sync_prepare, - .rpc_call_done = nfs41_call_sync_done, -}; - static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int privileged) + struct nfs4_sequence_res *res) { int ret; struct rpc_task *task; @@ -740,8 +673,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - if (privileged) - task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -752,24 +683,18 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, return ret; } -int _nfs4_call_sync_session(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply) -{ - nfs41_init_sequence(args, res, cache_reply); - return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0); -} - #else -static inline +static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { } +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ +} + + static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -777,18 +702,17 @@ static int nfs4_sequence_done(struct rpc_task *task, } #endif /* CONFIG_NFS_V4_1 */ +static int _nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply) + struct nfs4_sequence_res *res) { - 
nfs41_init_sequence(args, res, cache_reply); return rpc_call_sync(clnt, msg, 0); } -static inline +static int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -796,8 +720,9 @@ int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { + nfs41_init_sequence(args, res, cache_reply); return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, - args, res, cache_reply); + args, res); } static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) @@ -1445,13 +1370,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state nfs_inode_find_state_and_recover(state->inode, stateid); nfs4_schedule_stateid_recovery(server, state); - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ case -ENOMEM: err = 0; goto out; @@ -1574,20 +1492,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) &data->o_res.seq_res, task) != 0) nfs_release_seqid(data->o_arg.seqid); - else - rpc_call_start(task); return; unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; - -} - -static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs4_open_prepare(task, calldata); + nfs4_sequence_done(task, &data->o_res.seq_res); } static void nfs4_open_done(struct rpc_task *task, void *calldata) @@ -1648,12 +1558,6 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -static const struct rpc_call_ops nfs4_recover_open_ops = { - .rpc_call_prepare = nfs4_recover_open_prepare, - .rpc_call_done = nfs4_open_done, - .rpc_release = nfs4_open_release, -}; - static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { struct inode *dir = data->dir->d_inode; @@ -1683,7 +1587,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->rpc_status = 0; data->cancelled = 0; if (isrecover) - task_setup_data.callback_ops = &nfs4_recover_open_ops; + nfs4_set_sequence_privileged(&o_arg->seq_args); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1789,24 +1693,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_client_recover_expired_lease(struct nfs_client *clp) -{ - unsigned int loop; - int ret; - - for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { - ret = nfs4_wait_clnt_recover(clp); - if (ret != 0) - break; - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && - !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) - break; - nfs4_schedule_state_manager(clp); - ret = -EIO; - } - return ret; -} - static int nfs4_recover_expired_lease(struct nfs_server *server) { return nfs4_client_recover_expired_lease(server->nfs_client); @@ -2282,6 +2168,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; + nfs4_sequence_done(task, &calldata->res.seq_res); goto out; } @@ -2299,8 +2186,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - else - rpc_call_start(task); out: dprintk("%s: done!\n", __func__); } @@ -2533,7 +2418,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, rpc_authflavor_t 
flav_array[NFS_MAX_SECFLAVORS]; len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); - BUG_ON(len < 0); + if (len < 0) + return len; for (i = 0; i < len; i++) { /* AUTH_UNIX is the default flavor if none was specified, @@ -3038,12 +2924,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->dir), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) @@ -3071,12 +2955,10 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->old_dir), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->old_dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, @@ -3362,9 +3244,6 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, int mode = sattr->ia_mode; int status = -ENOMEM; - BUG_ON(!(sattr->ia_valid & ATTR_MODE)); - BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); - data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); if (data == NULL) goto out; @@ -3380,10 +3259,13 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, data->arg.ftype = NF4CHR; data->arg.u.device.specdata1 = MAJOR(rdev); data->arg.u.device.specdata2 = MINOR(rdev); + } else if (!S_ISSOCK(mode)) { + status = -EINVAL; + goto out_free; } status = nfs4_do_create(dir, dentry, data); - +out_free: nfs4_free_createdata(data); out: return status; @@ -3565,12 +3447,10 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3631,22 +3511,18 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) @@ -3937,8 +3813,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu goto out_free; 
} nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); - if (buf) + if (buf) { + if (res.acl_len > buflen) { + ret = -ERANGE; + goto out_free; + } _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); + } out_ok: ret = res.acl_len; out_free: @@ -4085,7 +3966,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; @@ -4293,11 +4173,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_setup_sequence(d_data->res.server, - &d_data->args.seq_args, - &d_data->res.seq_res, task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, + task); } #endif /* CONFIG_NFS_V4_1 */ @@ -4543,6 +4422,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ task->tk_action = NULL; + nfs4_sequence_done(task, &calldata->res.seq_res); return; } calldata->timestamp = jiffies; @@ -4551,8 +4431,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - else - rpc_call_start(task); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4696,8 +4574,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) return; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { - if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; + } data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; @@ -4707,20 +4586,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs4_setup_sequence(data->server, &data->arg.seq_args, &data->res.seq_res, - task) == 0) { - rpc_call_start(task); + task) == 0) return; - } nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); - dprintk("%s: done!, ret = %d\n", __func__, task->tk_status); -} - -static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs4_lock_prepare(task, calldata); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } static void nfs4_lock_done(struct rpc_task *task, void *calldata) @@ -4775,12 +4646,6 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; -static const struct rpc_call_ops nfs4_recover_lock_ops = { - .rpc_call_prepare = nfs4_recover_lock_prepare, - .rpc_call_done = nfs4_lock_done, - .rpc_release = nfs4_lock_release, -}; - static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { switch (error) { @@ -4823,15 +4688,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - if (recovery_type > NFS_LOCK_NEW) { - if (recovery_type == NFS_LOCK_RECLAIM) - data->arg.reclaim = NFS_LOCK_RECLAIM; - task_setup_data.callback_ops = &nfs4_recover_lock_ops; - } 
nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + nfs4_set_sequence_privileged(&data->arg.seq_args); + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5100,15 +4965,6 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ - err = 0; - goto out; case -ENOMEM: case -NFS4ERR_DENIED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ @@ -5357,7 +5213,6 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred }; dprintk("--> %s\n", __func__); - BUG_ON(clp == NULL); res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); if (unlikely(res.session == NULL)) { @@ -5569,20 +5424,16 @@ struct nfs4_get_lease_time_data { static void nfs4_get_lease_time_prepare(struct rpc_task *task, void *calldata) { - int ret; struct nfs4_get_lease_time_data *data = (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* just setup sequence, do not trigger session recovery since we're invoked within one */ - ret = nfs41_setup_sequence(data->clp->cl_session, - &data->args->la_seq_args, - &data->res->lr_seq_res, task); - - BUG_ON(ret == -EAGAIN); - rpc_call_start(task); + nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, + task); dprintk("<-- %s\n", __func__); } @@ -5644,6 +5495,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) int status; nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); + nfs4_set_sequence_privileged(&args.la_seq_args); dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -5658,145 +5510,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } -static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) -{ - return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); -} - -static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, - struct nfs4_slot *new, - u32 max_slots, - u32 ivalue) -{ - struct nfs4_slot *old = NULL; - u32 i; - - spin_lock(&tbl->slot_tbl_lock); - if (new) { - old = tbl->slots; - tbl->slots = new; - tbl->max_slots = max_slots; - } - tbl->highest_used_slotid = NFS4_NO_SLOT; - for (i = 0; i < tbl->max_slots; i++) - tbl->slots[i].seq_nr = ivalue; - spin_unlock(&tbl->slot_tbl_lock); - kfree(old); -} - -/* - * (re)Initialise a slot table - */ -static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, - u32 ivalue) -{ - struct nfs4_slot *new = NULL; - int ret = -ENOMEM; - - dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, - max_reqs, tbl->max_slots); - - /* Does the newly negotiated max_reqs match the existing slot table? 
*/ - if (max_reqs != tbl->max_slots) { - new = nfs4_alloc_slots(max_reqs, GFP_NOFS); - if (!new) - goto out; - } - ret = 0; - - nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue); - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, - tbl, tbl->slots, tbl->max_slots); -out: - dprintk("<-- %s: return %d\n", __func__, ret); - return ret; -} - -/* Destroy the slot table */ -static void nfs4_destroy_slot_tables(struct nfs4_session *session) -{ - if (session->fc_slot_table.slots != NULL) { - kfree(session->fc_slot_table.slots); - session->fc_slot_table.slots = NULL; - } - if (session->bc_slot_table.slots != NULL) { - kfree(session->bc_slot_table.slots); - session->bc_slot_table.slots = NULL; - } - return; -} - -/* - * Initialize or reset the forechannel and backchannel tables - */ -static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) -{ - struct nfs4_slot_table *tbl; - int status; - - dprintk("--> %s\n", __func__); - /* Fore channel */ - tbl = &ses->fc_slot_table; - status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ - return status; - /* Back channel */ - tbl = &ses->bc_slot_table; - status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); - if (status && tbl->slots == NULL) - /* Fore and back channel share a connection so get - * both slot tables or neither */ - nfs4_destroy_slot_tables(ses); - return status; -} - -struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) -{ - struct nfs4_session *session; - struct nfs4_slot_table *tbl; - - session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); - if (!session) - return NULL; - - tbl = &session->fc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); - init_completion(&tbl->complete); - - tbl = &session->bc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); - init_completion(&tbl->complete); - - session->session_state = 1<clp = clp; - return session; -} - -void nfs4_destroy_session(struct nfs4_session *session) -{ - struct rpc_xprt *xprt; - struct rpc_cred *cred; - - cred = nfs4_get_exchange_id_cred(session->clp); - nfs4_proc_destroy_session(session, cred); - if (cred) - put_rpccred(cred); - - rcu_read_lock(); - xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); - rcu_read_unlock(); - dprintk("%s Destroy backchannel for xprt %p\n", - __func__, xprt); - xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); - nfs4_destroy_slot_tables(session); - kfree(session); -} - /* * Initialize the values to be used by the client in CREATE_SESSION * If nfs4_init_session set the fore channel request and response sizes, @@ -5809,8 +5522,8 @@ void nfs4_destroy_session(struct nfs4_session *session) static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) { struct nfs4_session *session = args->client->cl_session; - unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, - mxresp_sz = session->fc_attrs.max_resp_sz; + unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, + mxresp_sz = session->fc_target_max_resp_sz; if (mxrqst_sz == 0) mxrqst_sz = NFS_MAX_FILE_IO_SIZE; @@ -5919,10 +5632,9 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (!status) + if (!status) { /* Verify the session's negotiated channel_attrs values */ status 
= nfs4_verify_channel_attrs(&args, session); - if (!status) { /* Increment the clientid slot sequence id */ clp->cl_seqid++; } @@ -5991,83 +5703,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, return status; } -/* - * With sessions, the client is not marked ready until after a - * successful EXCHANGE_ID and CREATE_SESSION. - * - * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate - * other versions of NFS can be tried. - */ -static int nfs41_check_session_ready(struct nfs_client *clp) -{ - int ret; - - if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { - ret = nfs4_client_recover_expired_lease(clp); - if (ret) - return ret; - } - if (clp->cl_cons_state < NFS_CS_READY) - return -EPROTONOSUPPORT; - smp_rmb(); - return 0; -} - -int nfs4_init_session(struct nfs_server *server) -{ - struct nfs_client *clp = server->nfs_client; - struct nfs4_session *session; - unsigned int rsize, wsize; - - if (!nfs4_has_session(clp)) - return 0; - - session = clp->cl_session; - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - - rsize = server->rsize; - if (rsize == 0) - rsize = NFS_MAX_FILE_IO_SIZE; - wsize = server->wsize; - if (wsize == 0) - wsize = NFS_MAX_FILE_IO_SIZE; - - session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; - session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; - } - spin_unlock(&clp->cl_lock); - - return nfs41_check_session_ready(clp); -} - -int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) -{ - struct nfs4_session *session = clp->cl_session; - int ret; - - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - /* - * Do not set NFS_CS_CHECK_LEASE_TIME instead set the - * DS lease to be equal to the MDS lease. - */ - clp->cl_lease_time = lease_time; - clp->cl_last_renewal = jiffies; - } - spin_unlock(&clp->cl_lock); - - ret = nfs41_check_session_ready(clp); - if (ret) - return ret; - /* Test for the DS role */ - if (!is_ds_client(clp)) - return -ENODEV; - return 0; -} -EXPORT_SYMBOL_GPL(nfs4_init_ds_session); - - /* * Renew the cl_session lease. 
*/ @@ -6133,9 +5768,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs41_setup_sequence(clp->cl_session, args, res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(clp->cl_session, args, res, task); } static const struct rpc_call_ops nfs41_sequence_ops = { @@ -6144,7 +5777,9 @@ static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_release = nfs41_sequence_release, }; -static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + bool is_privileged) { struct nfs4_sequence_data *calldata; struct rpc_message msg = { @@ -6166,6 +5801,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ return ERR_PTR(-ENOMEM); } nfs41_init_sequence(&calldata->args, &calldata->res, 0); + if (is_privileged) + nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -6181,7 +5818,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return 0; - task = _nfs41_proc_sequence(clp, cred); + task = _nfs41_proc_sequence(clp, cred, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -6195,7 +5832,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred); + task = _nfs41_proc_sequence(clp, cred, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -6224,13 +5861,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - if (nfs41_setup_sequence(calldata->clp->cl_session, - &calldata->arg.seq_args, - &calldata->res.seq_res, task)) - return; - - rpc_call_start(task); + nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task); } static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) @@ -6307,6 +5941,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) calldata->arg.one_fs = 0; nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); + nfs4_set_sequence_privileged(&calldata->arg.seq_args); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -6330,6 +5965,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct nfs4_session *session = nfs4_get_session(server); dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -6337,16 +5973,14 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) * However, that is not so catastrophic, and there seems * to be no way to prevent it completely. 
*/ - if (nfs4_setup_sequence(server, &lgp->args.seq_args, + if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, lgp->args.ctx->state)) { rpc_exit(task, NFS4_OK); - return; } - rpc_call_start(task); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -6359,7 +5993,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; switch (task->tk_status) { @@ -6510,10 +6144,10 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, - &lrp->res.seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(lrp->clp->cl_session, + &lrp->args.seq_args, + &lrp->res.seq_res, + task); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) @@ -6523,7 +6157,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_done(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -6672,11 +6306,12 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); + struct nfs4_session *session = nfs4_get_session(server); - if (nfs4_setup_sequence(server, &data->args.seq_args, - &data->res.seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(session, + &data->args.seq_args, + &data->res.seq_res, + task); } static void @@ -6685,7 +6320,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); - if (!nfs4_sequence_done(task, &data->res.seq_res)) + if (!nfs41_sequence_done(task, &data->res.seq_res)) return; switch (task->tk_status) { /* Just ignore these failures */ @@ -6873,7 +6508,9 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) dprintk("NFS call test_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(server->client, server, &msg, + &args.seq_args, &res.seq_res); if (status != NFS_OK) { dprintk("NFS reply test_stateid: failed, %d\n", status); return status; @@ -6920,8 +6557,9 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) dprintk("NFS call free_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); status = nfs4_call_sync_sequence(server->client, server, &msg, - &args.seq_args, &res.seq_res, 1); + &args.seq_args, &res.seq_res); dprintk("NFS reply free_stateid: %d\n", status); return status; } @@ -7041,7 +6679,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { #if defined(CONFIG_NFS_V4_1) static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, - .call_sync = _nfs4_call_sync_session, + .call_sync = nfs4_call_sync_sequence, .match_stateid = 
nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, diff --git a/trunk/fs/nfs/nfs4session.c b/trunk/fs/nfs/nfs4session.c new file mode 100644 index 000000000000..ebda5f4a031b --- /dev/null +++ b/trunk/fs/nfs/nfs4session.c @@ -0,0 +1,552 @@ +/* + * fs/nfs/nfs4session.c + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) +{ + struct nfs4_slot **p; + if (newsize >= tbl->max_slots) + return; + + p = &tbl->slots; + while (newsize--) + p = &(*p)->next; + while (*p) { + struct nfs4_slot *slot = *p; + + *p = slot->next; + kfree(slot); + tbl->max_slots--; + } +} + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. + * + * Must be called while holding tbl->slot_tbl_lock + */ +void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) +{ + u32 slotid = slot->slot_nr; + + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + u32 new_max = find_last_bit(tbl->used_slots, slotid); + if (new_max < slotid) + tbl->highest_used_slotid = new_max; + else { + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_session_drain_complete(tbl->session, tbl); + } + } + dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, + slotid, tbl->highest_used_slotid); +} + +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot *slot; + + slot = kzalloc(sizeof(*slot), gfp_mask); + if (slot) { + slot->table = tbl; + slot->slot_nr = slotid; + slot->seq_nr = seq_init; + } + return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot **p, *slot; + + p = &tbl->slots; + for (;;) { + if (*p == NULL) { + *p = nfs4_new_slot(tbl, tbl->max_slots, + seq_init, gfp_mask); + if (*p == NULL) + break; + tbl->max_slots++; + } + slot = *p; + if (slot->slot_nr == slotid) + return slot; + p = &slot->next; + } + return ERR_PTR(-ENOMEM); +} + +/* + * nfs4_alloc_slot - efficiently look for a free slot + * + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * + * Note: must be called with under the slot_tbl_lock. 
+ */ +struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *ret = ERR_PTR(-EBUSY); + u32 slotid; + + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slotid + 1); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); + if (slotid > tbl->max_slotid) + goto out; + ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + if (IS_ERR(ret)) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid || + tbl->highest_used_slotid == NFS4_NO_SLOT) + tbl->highest_used_slotid = slotid; + ret->generation = tbl->generation; + +out: + dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + !IS_ERR(ret) ? ret->slot_nr : -1); + return ret; +} + +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + if (max_reqs <= tbl->max_slots) + return 0; + if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) + return 0; + return -ENOMEM; +} + +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, + u32 server_highest_slotid, + u32 ivalue) +{ + struct nfs4_slot **p; + + nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); + p = &tbl->slots; + while (*p) { + (*p)->seq_nr = ivalue; + (*p)->interrupted = 0; + p = &(*p)->next; + } + tbl->highest_used_slotid = NFS4_NO_SLOT; + tbl->target_highest_slotid = server_highest_slotid; + tbl->server_highest_slotid = server_highest_slotid; + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + tbl->max_slotid = server_highest_slotid; +} + +/* + * (re)Initialise a slot table + */ +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + int ret; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); + + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + goto out; + + spin_lock(&tbl->slot_tbl_lock); + nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); + spin_unlock(&tbl->slot_tbl_lock); + + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* Destroy the slot table */ +static void nfs4_destroy_slot_tables(struct nfs4_session *session) +{ + nfs4_shrink_slot_table(&session->fc_slot_table, 0); + nfs4_shrink_slot_table(&session->bc_slot_table, 0); +} + +static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) +{ + struct nfs4_sequence_args *args = task->tk_msg.rpc_argp; + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + struct nfs4_slot *slot = pslot; + struct nfs4_slot_table *tbl = slot->table; + + if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + return false; + slot->generation = tbl->generation; + args->sa_slot = slot; + res->sr_timestamp = jiffies; + res->sr_slot = slot; + res->sr_status_flags = 0; + res->sr_status = 1; + return true; +} + +static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot)) + return true; + return false; +} + +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (slot->slot_nr > tbl->max_slotid) + return false; + return __nfs41_wake_and_assign_slot(tbl, slot); +} + 
+static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *slot = nfs4_alloc_slot(tbl); + if (!IS_ERR(slot)) { + bool ret = __nfs41_wake_and_assign_slot(tbl, slot); + if (ret) + return ret; + nfs4_free_slot(tbl, slot); + } + return false; +} + +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) +{ + for (;;) { + if (!nfs41_try_wake_next_slot_table_entry(tbl)) + break; + } +} + +static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + u32 max_slotid; + + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid); + if (max_slotid > tbl->server_highest_slotid) + max_slotid = tbl->server_highest_slotid; + if (max_slotid > tbl->target_highest_slotid) + max_slotid = tbl->target_highest_slotid; + tbl->max_slotid = max_slotid; + nfs41_wake_slot_table(tbl); +} + +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + if (tbl->target_highest_slotid == target_highest_slotid) + return; + tbl->target_highest_slotid = target_highest_slotid; + tbl->generation++; +} + +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + spin_lock(&tbl->slot_tbl_lock); + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + nfs41_set_max_slotid_locked(tbl, target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, + u32 highest_slotid) +{ + if (tbl->server_highest_slotid == highest_slotid) + return; + if (tbl->highest_used_slotid > highest_slotid) + return; + /* Deallocate slots */ + nfs4_shrink_slot_table(tbl, highest_slotid + 1); + tbl->server_highest_slotid = highest_slotid; +} + +static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2) +{ + s1 -= s2; + if (s1 == 0) + return 0; + if (s1 < 0) + return (s1 - 1) >> 1; + return (s1 + 1) >> 1; +} + +static int nfs41_sign_s32(s32 s1) +{ + if (s1 > 0) + return 1; + if (s1 < 0) + return -1; + return 0; +} + +static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2) +{ + if (!s1 || !s2) + return true; + return nfs41_sign_s32(s1) == nfs41_sign_s32(s2); +} + +/* Try to eliminate outliers by checking for sharp changes in the + * derivatives and second derivatives + */ +static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl, + u32 new_target) +{ + s32 d_target, d2_target; + bool ret = true; + + d_target = nfs41_derivative_target_slotid(new_target, + tbl->target_highest_slotid); + d2_target = nfs41_derivative_target_slotid(d_target, + tbl->d_target_highest_slotid); + /* Is first derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid)) + ret = false; + /* Is second derivative same sign? 
*/ + if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid)) + ret = false; + tbl->d_target_highest_slotid = d_target; + tbl->d2_target_highest_slotid = d2_target; + return ret; +} + +void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res) +{ + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); + if (tbl->generation == slot->generation) + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); + nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +/* + * Initialize or reset the forechannel and backchannel tables + */ +int nfs4_setup_session_slot_tables(struct nfs4_session *ses) +{ + struct nfs4_slot_table *tbl; + int status; + + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; + /* Back channel */ + tbl = &ses->bc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); + return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + tbl = &session->fc_slot_table; + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); + + tbl = &session->bc_slot_table; + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); + + session->session_state = 1<<NFS4_SESSION_INITING; + + session->clp = clp; + return session; +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + struct rpc_xprt *xprt; + struct rpc_cred *cred; + + cred = nfs4_get_exchange_id_cred(session->clp); + nfs4_proc_destroy_session(session, cred); + if (cred) + put_rpccred(cred); + + rcu_read_lock(); + xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, xprt); + xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_slot_tables(session); + kfree(session); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. 
+ */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ + int ret; + + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + ret = nfs4_client_recover_expired_lease(clp); + if (ret) + return ret; + } + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + smp_rmb(); + return 0; +} + +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; + unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; + unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; + + if (!nfs4_has_session(clp)) + return 0; + + if (server->rsize != 0) + target_max_resp_sz = server->rsize; + target_max_resp_sz += nfs41_maxread_overhead; + + if (server->wsize != 0) + target_max_rqst_sz = server->wsize; + target_max_rqst_sz += nfs41_maxwrite_overhead; + + session = clp->cl_session; + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* Initialise targets and channel attributes */ + session->fc_target_max_rqst_sz = target_max_rqst_sz; + session->fc_attrs.max_rqst_sz = target_max_rqst_sz; + session->fc_target_max_resp_sz = target_max_resp_sz; + session->fc_attrs.max_resp_sz = target_max_resp_sz; + } else { + /* Just adjust the targets */ + if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { + session->fc_target_max_rqst_sz = target_max_rqst_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + if (target_max_resp_sz > session->fc_target_max_resp_sz) { + session->fc_target_max_resp_sz = target_max_resp_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + } + spin_unlock(&clp->cl_lock); + + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + nfs4_schedule_lease_recovery(clp); + + return nfs41_check_session_ready(clp); +} + +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the + * DS lease to be equal to the MDS lease. 
+ */ + clp->cl_lease_time = lease_time; + clp->cl_last_renewal = jiffies; + } + spin_unlock(&clp->cl_lock); + + ret = nfs41_check_session_ready(clp); + if (ret) + return ret; + /* Test for the DS role */ + if (!is_ds_client(clp)) + return -ENODEV; + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + diff --git a/trunk/fs/nfs/nfs4session.h b/trunk/fs/nfs/nfs4session.h new file mode 100644 index 000000000000..6f3cb39386d4 --- /dev/null +++ b/trunk/fs/nfs/nfs4session.h @@ -0,0 +1,142 @@ +/* + * fs/nfs/nfs4session.h + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#ifndef __LINUX_FS_NFS_NFS4SESSION_H +#define __LINUX_FS_NFS_NFS4SESSION_H + +/* maximum number of slots to use */ +#define NFS4_DEF_SLOT_TABLE_SIZE (16U) +#define NFS4_MAX_SLOT_TABLE (1024U) +#define NFS4_NO_SLOT ((u32)-1) + +#if IS_ENABLED(CONFIG_NFS_V4) + +/* Sessions slot seqid */ +struct nfs4_slot { + struct nfs4_slot_table *table; + struct nfs4_slot *next; + unsigned long generation; + u32 slot_nr; + u32 seq_nr; + unsigned int interrupted : 1; +}; + +/* Sessions */ +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +struct nfs4_slot_table { + struct nfs4_session *session; /* Parent session */ + struct nfs4_slot *slots; /* seqid per slot */ + unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ + spinlock_t slot_tbl_lock; + struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + u32 max_slots; /* # slots in table */ + u32 max_slotid; /* Max allowed slotid value */ + u32 highest_used_slotid; /* sent to server on each SEQ. + * op for dynamic resizing */ + u32 target_highest_slotid; /* Server max_slot target */ + u32 server_highest_slotid; /* Server highest slotid */ + s32 d_target_highest_slotid; /* Derivative */ + s32 d2_target_highest_slotid; /* 2nd derivative */ + unsigned long generation; /* Generation counter for + target_highest_slotid */ + struct completion complete; +}; + +/* + * Session related parameters + */ +struct nfs4_session { + struct nfs4_sessionid sess_id; + u32 flags; + unsigned long session_state; + u32 hash_alg; + u32 ssv_len; + + /* The fore and back channel */ + struct nfs4_channel_attrs fc_attrs; + struct nfs4_slot_table fc_slot_table; + struct nfs4_channel_attrs bc_attrs; + struct nfs4_slot_table bc_slot_table; + struct nfs_client *clp; + /* Create session arguments */ + unsigned int fc_target_max_rqst_sz; + unsigned int fc_target_max_resp_sz; +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +#if defined(CONFIG_NFS_V4_1) +extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); +extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); + +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid); +extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res); + +extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); + +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); + +extern void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl); + +static inline bool nfs4_session_draining(struct nfs4_session *session) +{ + return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); +} + +bool 
nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + if (clp->cl_session) + return 1; + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); + return 0; +} + +#else /* defined(CONFIG_NFS_V4_1) */ + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + return 0; +} + +#endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ +#endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/trunk/fs/nfs/nfs4state.c b/trunk/fs/nfs/nfs4state.c index c351e6b39838..9448c579d41a 100644 --- a/trunk/fs/nfs/nfs4state.c +++ b/trunk/fs/nfs/nfs4state.c @@ -57,6 +57,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -66,7 +67,6 @@ const nfs4_stateid zero_stateid; static DEFINE_MUTEX(nfs_clid_init_mutex); -static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { @@ -254,24 +254,27 @@ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; struct nfs4_slot_table *tbl; - int max_slots; if (ses == NULL) return; tbl = &ses->fc_slot_table; if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&tbl->slot_tbl_lock); - max_slots = tbl->max_slots; - while (max_slots--) { - if (rpc_wake_up_first(&tbl->slot_tbl_waitq, - nfs4_set_task_privileged, - NULL) == NULL) - break; - } + nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); } } +/* + * Signal state manager thread if session fore channel is drained + */ +void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl) +{ + if (nfs4_session_draining(session)) + complete(&tbl->complete); +} + static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { spin_lock(&tbl->slot_tbl_lock); @@ -303,7 +306,6 @@ static void nfs41_finish_session_reset(struct nfs_client *clp) clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ - clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); nfs41_setup_state_renewal(clp); } @@ -1086,7 +1088,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid) */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { - BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid); switch (status) { case 0: break; @@ -1209,6 +1210,40 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (res) + return res; + + if (clp->cl_cons_state < 0) + return clp->cl_cons_state; + return 0; +} + +int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = 
NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret = -EIO; + } + return ret; +} + /* * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN * @clp: client to process @@ -1401,14 +1436,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs /* Mark the file as being 'closed' */ state->state = 0; break; - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ - break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1561,14 +1588,6 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } -static void nfs4_warn_keyexpired(const char *s) -{ - printk_ratelimited(KERN_WARNING "Error: state manager" - " encountered RPCSEC_GSS session" - " expired against NFSv4 server %s.\n", - s); -} - static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { @@ -1602,10 +1621,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); break; - case -EKEYEXPIRED: - /* Nothing we can do */ - nfs4_warn_keyexpired(clp->cl_hostname); - break; default: dprintk("%s: failed to handle error %d for server %s\n", __func__, error, clp->cl_hostname); @@ -1722,8 +1737,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) dprintk("%s: exit with error %d for server %s\n", __func__, -EPROTONOSUPPORT, clp->cl_hostname); return -EPROTONOSUPPORT; - case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: @@ -1876,7 +1889,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, break; case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ status = -EKEYEXPIRED; @@ -1907,14 +1919,23 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); -void nfs41_handle_recall_slot(struct nfs_client *clp) +static void nfs41_ping_server(struct nfs_client *clp) { - set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - dprintk("%s: scheduling slot recall for server %s\n", __func__, - clp->cl_hostname); + /* Use CHECK_LEASE to ping the server with a SEQUENCE */ + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); nfs4_schedule_state_manager(clp); } +void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + +void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { @@ -2024,35 +2045,6 @@ static int nfs4_reset_session(struct nfs_client *clp) return status; } -static int nfs4_recall_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *fc_tbl; - struct nfs4_slot *new, *old; - int i; - - if (!nfs4_has_session(clp)) - return 0; - nfs4_begin_drain_session(clp); - fc_tbl = &clp->cl_session->fc_slot_table; - new = 
kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), - GFP_NOFS); - if (!new) - return -ENOMEM; - - spin_lock(&fc_tbl->slot_tbl_lock); - for (i = 0; i < fc_tbl->target_max_slots; i++) - new[i].seq_nr = fc_tbl->slots[i].seq_nr; - old = fc_tbl->slots; - fc_tbl->slots = new; - fc_tbl->max_slots = fc_tbl->target_max_slots; - fc_tbl->target_max_slots = 0; - clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots; - spin_unlock(&fc_tbl->slot_tbl_lock); - - kfree(old); - return 0; -} - static int nfs4_bind_conn_to_session(struct nfs_client *clp) { struct rpc_cred *cred; @@ -2083,7 +2075,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } -static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } static int nfs4_bind_conn_to_session(struct nfs_client *clp) { @@ -2115,15 +2106,6 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } - if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { - section = "check lease"; - status = nfs4_check_lease(clp); - if (status < 0) - goto out_error; - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) - continue; - } - /* Initialize or reset the session */ if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { section = "reset session"; @@ -2144,10 +2126,9 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } - /* Recall session slots */ - if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { - section = "recall slot"; - status = nfs4_recall_slot(clp); + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + section = "check lease"; + status = nfs4_check_lease(clp); if (status < 0) goto out_error; continue; diff --git a/trunk/fs/nfs/nfs4super.c b/trunk/fs/nfs/nfs4super.c index bd61221ad2c5..84d2e9e2f313 100644 --- a/trunk/fs/nfs/nfs4super.c +++ b/trunk/fs/nfs/nfs4super.c @@ -51,6 +51,7 @@ static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs4_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs4_evict_inode, diff --git a/trunk/fs/nfs/nfs4xdr.c b/trunk/fs/nfs/nfs4xdr.c index 40836ee5dc3a..26b143920433 100644 --- a/trunk/fs/nfs/nfs4xdr.c +++ b/trunk/fs/nfs/nfs4xdr.c @@ -56,6 +56,7 @@ #include "nfs4_fs.h" #include "internal.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -270,6 +271,8 @@ static int nfs4_stat_to_errno(int); #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) +#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \ + sizeof(utsname()->version) + sizeof(utsname()->machine) + 8) #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ @@ -282,7 +285,7 @@ static int nfs4_stat_to_errno(int); 1 /* nii_domain */ + \ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* nii_name */ + \ - XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + XDR_QUADLEN(IMPL_NAME_LIMIT) + \ 3 /* nii_date */) #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 2 /* eir_clientid */ + \ @@ -936,7 +939,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, * but this is not required as a MUST for the server to do so. 
*/ hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; - BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); + WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); p = reserve_space(xdr, 8); *p++ = cpu_to_be32(hdr->minorversion); @@ -955,7 +958,7 @@ static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, static void encode_nops(struct compound_hdr *hdr) { - BUG_ON(hdr->nops > NFS4_MAX_OPS); + WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS); *hdr->nops_p = htonl(hdr->nops); } @@ -1403,7 +1406,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: - BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } @@ -1621,7 +1623,6 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); - BUG_ON(arg->acl_len % 4); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); @@ -1713,7 +1714,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - char impl_name[NFS4_OPAQUE_LIMIT]; + char impl_name[IMPL_NAME_LIMIT]; int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); @@ -1728,7 +1729,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, if (send_implementation_id && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - <= NFS4_OPAQUE_LIMIT + 1) + <= sizeof(impl_name) + 1) len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", utsname()->sysname, utsname()->release, utsname()->version, utsname()->machine); @@ -1835,18 +1836,16 @@ static void encode_sequence(struct xdr_stream *xdr, struct compound_hdr *hdr) { #if defined(CONFIG_NFS_V4_1) - struct nfs4_session *session = args->sa_session; + struct nfs4_session *session; struct nfs4_slot_table *tp; - struct nfs4_slot *slot; + struct nfs4_slot *slot = args->sa_slot; __be32 *p; - if (!session) + if (slot == NULL) return; - tp = &session->fc_slot_table; - - WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); - slot = tp->slots + args->sa_slotid; + tp = slot->table; + session = tp->session; encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); @@ -1860,12 +1859,12 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[1], ((u32 *)session->sess_id.data)[2], ((u32 *)session->sess_id.data)[3], - slot->seq_nr, args->sa_slotid, + slot->seq_nr, slot->slot_nr, tp->highest_used_slotid, args->sa_cache_this); p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(slot->seq_nr); - *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(slot->slot_nr); *p++ = cpu_to_be32(tp->highest_used_slotid); *p = cpu_to_be32(args->sa_cache_this); #endif /* CONFIG_NFS_V4_1 */ @@ -2027,8 +2026,9 @@ static void encode_free_stateid(struct xdr_stream *xdr, static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) { #if defined(CONFIG_NFS_V4_1) - if (args->sa_session) - return args->sa_session->clp->cl_mvops->minor_version; + + if (args->sa_slot) + return args->sa_slot->table->session->clp->cl_mvops->minor_version; #endif /* CONFIG_NFS_V4_1 */ return 0; } @@ -5509,12 +5509,13 @@ static int decode_sequence(struct xdr_stream *xdr, struct rpc_rqst *rqstp) { 
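	/*
	 * Editor's note, not part of the original patch: with the slot-table
	 * rewrite the SEQUENCE reply decoded below is checked against the slot
	 * that was actually used (res->sr_slot) rather than against a cached
	 * sa_session pointer -- the decoded session id must match the slot
	 * table's session and the decoded slot id must equal
	 * res->sr_slot->slot_nr, while the server's highest and target highest
	 * slot ids are now stored in sr_highest_slotid and
	 * sr_target_highest_slotid instead of being discarded.
	 */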
#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; struct nfs4_sessionid id; u32 dummy; int status; __be32 *p; - if (!res->sr_session) + if (res->sr_slot == NULL) return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); @@ -5528,8 +5529,9 @@ static int decode_sequence(struct xdr_stream *xdr, * sequence number, the server is looney tunes. */ status = -EREMOTEIO; + session = res->sr_slot->table->session; - if (memcmp(id.data, res->sr_session->sess_id.data, + if (memcmp(id.data, session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; @@ -5547,14 +5549,14 @@ static int decode_sequence(struct xdr_stream *xdr, } /* slot id */ dummy = be32_to_cpup(p++); - if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { + if (dummy != res->sr_slot->slot_nr) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } - /* highest slot id - currently not processed */ - dummy = be32_to_cpup(p++); - /* target highest slot id - currently not processed */ - dummy = be32_to_cpup(p++); + /* highest slot id */ + res->sr_highest_slotid = be32_to_cpup(p++); + /* target highest slot id */ + res->sr_target_highest_slotid = be32_to_cpup(p++); /* result flags */ res->sr_status_flags = be32_to_cpup(p); status = 0; diff --git a/trunk/fs/nfs/objlayout/objlayout.c b/trunk/fs/nfs/objlayout/objlayout.c index 874613545301..a9ebd817278b 100644 --- a/trunk/fs/nfs/objlayout/objlayout.c +++ b/trunk/fs/nfs/objlayout/objlayout.c @@ -148,17 +148,6 @@ end_offset(u64 start, u64 len) return end >= start ? end : NFS4_MAX_UINT64; } -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1 : NFS4_MAX_UINT64; -} - static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, struct page ***p_pages, unsigned *p_pgbase, u64 offset, unsigned long count) diff --git a/trunk/fs/nfs/pnfs.c b/trunk/fs/nfs/pnfs.c index 2878f97bd78d..e7165d915362 100644 --- a/trunk/fs/nfs/pnfs.c +++ b/trunk/fs/nfs/pnfs.c @@ -369,17 +369,6 @@ end_offset(u64 start, u64 len) return end >= start ? end : NFS4_MAX_UINT64; } -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1 : NFS4_MAX_UINT64; -} - /* * is l2 fully contained in l1? * start1 end1 @@ -645,7 +634,6 @@ send_layoutget(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), gfp_flags); if (lgp == NULL) return NULL; @@ -1126,7 +1114,6 @@ pnfs_update_layout(struct inode *ino, * chance of a CB_LAYOUTRECALL(FILE) coming in. 
*/ spin_lock(&clp->cl_lock); - BUG_ON(!list_empty(&lo->plh_layouts)); list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -1222,7 +1209,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r { u64 rd_size = req->wb_bytes; - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_read_mds(pgio); @@ -1251,7 +1238,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_write_mds(pgio); diff --git a/trunk/fs/nfs/proc.c b/trunk/fs/nfs/proc.c index 50a88c3546ed..f084dac948e1 100644 --- a/trunk/fs/nfs/proc.c +++ b/trunk/fs/nfs/proc.c @@ -46,39 +46,6 @@ #define NFSDBG_FACILITY NFSDBG_PROC -/* - * wrapper to handle the -EKEYEXPIRED error message. This should generally - * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't - * support the NFSERR_JUKEBOX error code, but we handle this situation in the - * same way that we handle that error with NFSv3. - */ -static int -nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) -{ - int res; - do { - res = rpc_call_sync(clnt, msg, flags); - if (res != -EKEYEXPIRED) - break; - freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); - res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); - return res; -} - -#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) - -static int -nfs_async_handle_expired_key(struct rpc_task *task) -{ - if (task->tk_status != -EKEYEXPIRED) - return 0; - task->tk_status = 0; - rpc_restart_call(task); - rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); - return 1; -} - /* * Bare-bones access to getattr: this is for nfs_read_super. 
*/ @@ -364,8 +331,6 @@ static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlink static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { - if (nfs_async_handle_expired_key(task)) - return 0; nfs_mark_for_revalidate(dir); return 1; } @@ -385,8 +350,6 @@ static int nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, struct inode *new_dir) { - if (nfs_async_handle_expired_key(task)) - return 0; nfs_mark_for_revalidate(old_dir); nfs_mark_for_revalidate(new_dir); return 1; @@ -642,9 +605,6 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { struct inode *inode = data->header->inode; - if (nfs_async_handle_expired_key(task)) - return -EAGAIN; - nfs_invalidate_atime(inode); if (task->tk_status >= 0) { nfs_refresh_inode(inode, data->res.fattr); @@ -671,9 +631,6 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->header->inode; - if (nfs_async_handle_expired_key(task)) - return -EAGAIN; - if (task->tk_status >= 0) nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c index 652d3f7176a9..aa5315bb3666 100644 --- a/trunk/fs/nfs/super.c +++ b/trunk/fs/nfs/super.c @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "nfs4session.h" #include "pnfs.h" #include "nfs.h" @@ -307,6 +308,7 @@ const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs_evict_inode, diff --git a/trunk/fs/nfs/write.c b/trunk/fs/nfs/write.c index 9347ab7c9574..5209916e1222 100644 --- a/trunk/fs/nfs/write.c +++ b/trunk/fs/nfs/write.c @@ -202,7 +202,6 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c /* A writeback failed: mark the page as bad, and invalidate the page cache */ static void nfs_set_pageerror(struct page *page) { - SetPageError(page); nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } @@ -239,21 +238,18 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static int nfs_set_page_writeback(struct page *page) +static void nfs_set_page_writeback(struct page *page) { + struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); int ret = test_set_page_writeback(page); - if (!ret) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); + WARN_ON_ONCE(ret != 0); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); } - return ret; } static void nfs_end_page_writeback(struct page *page) @@ -315,10 +311,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (IS_ERR(req)) goto out; - ret = nfs_set_page_writeback(page); - BUG_ON(ret != 0); - BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + nfs_set_page_writeback(page); + WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); ret = pgio->pg_error; @@ -451,8 +447,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct inode *inode = 
req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - BUG_ON (!NFS_WBACK_BUSY(req)); - spin_lock(&inode->i_lock); if (likely(!PageSwapCache(req->wb_page))) { set_page_private(req->wb_page, 0); @@ -884,7 +878,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { if (nfs_have_delegated_attributes(inode)) goto out; - if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) return false; out: return PageUptodate(page) != 0; @@ -1727,7 +1721,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - BUG_ON(!PageLocked(page)); for (;;) { wait_on_page_writeback(page); req = nfs_page_find_request(page); @@ -1829,7 +1822,7 @@ int __init nfs_init_writepagecache(void) goto out_destroy_write_mempool; nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, - nfs_wdata_cachep); + nfs_cdata_cachep); if (nfs_commit_mempool == NULL) goto out_destroy_commit_cache; diff --git a/trunk/fs/notify/Makefile b/trunk/fs/notify/Makefile index ae5f33a6d868..96d3420d0242 100644 --- a/trunk/fs/notify/Makefile +++ b/trunk/fs/notify/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o + mark.o vfsmount_mark.o fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/trunk/fs/notify/fanotify/fanotify_user.c b/trunk/fs/notify/fanotify/fanotify_user.c index 6fcaeb8c902e..a5cd9bba022f 100644 --- a/trunk/fs/notify/fanotify/fanotify_user.c +++ b/trunk/fs/notify/fanotify/fanotify_user.c @@ -17,6 +17,7 @@ #include #include "../../mount.h" +#include "../fdinfo.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -428,6 +429,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar } static const struct file_operations fanotify_fops = { + .show_fdinfo = fanotify_show_fdinfo, .poll = fanotify_poll, .read = fanotify_read, .write = fanotify_write, diff --git a/trunk/fs/notify/fdinfo.c b/trunk/fs/notify/fdinfo.c new file mode 100644 index 000000000000..514c4b81483d --- /dev/null +++ b/trunk/fs/notify/fdinfo.c @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "inotify/inotify.h" +#include "../fs/mount.h" + +#if defined(CONFIG_PROC_FS) + +#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY) + +static int show_fdinfo(struct seq_file *m, struct file *f, + int (*show)(struct seq_file *m, struct fsnotify_mark *mark)) +{ + struct fsnotify_group *group = f->private_data; + struct fsnotify_mark *mark; + int ret = 0; + + spin_lock(&group->mark_lock); + list_for_each_entry(mark, &group->marks_list, g_list) { + ret = show(m, mark); + if (ret) + break; + } + spin_unlock(&group->mark_lock); + return ret; +} + +#if defined(CONFIG_EXPORTFS) +static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + struct { + struct file_handle handle; + u8 pad[64]; + } f; + int size, ret, i; + + f.handle.handle_bytes = sizeof(f.pad); + size = f.handle.handle_bytes >> 2; + + ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); + if ((ret == 255) || (ret == -ENOSPC)) { + WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + return 0; + } + + f.handle.handle_type = ret; + f.handle.handle_bytes = size * sizeof(u32); + + ret = seq_printf(m, "fhandle-bytes:%x 
fhandle-type:%x f_handle:", + f.handle.handle_bytes, f.handle.handle_type); + + for (i = 0; i < f.handle.handle_bytes; i++) + ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]); + + return ret; +} +#else +static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + return 0; +} +#endif + +#ifdef CONFIG_INOTIFY_USER + +static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + struct inotify_inode_mark *inode_mark; + struct inode *inode; + int ret = 0; + + if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + return 0; + + inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); + inode = igrab(mark->i.inode); + if (inode) { + ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x " + "mask:%x ignored_mask:%x ", + inode_mark->wd, inode->i_ino, + inode->i_sb->s_dev, + mark->mask, mark->ignored_mask); + ret |= show_mark_fhandle(m, inode); + ret |= seq_putc(m, '\n'); + iput(inode); + } + + return ret; +} + +int inotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + return show_fdinfo(m, f, inotify_fdinfo); +} + +#endif /* CONFIG_INOTIFY_USER */ + +#ifdef CONFIG_FANOTIFY + +static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + unsigned int mflags = 0; + struct inode *inode; + int ret = 0; + + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) + return 0; + + if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + mflags |= FAN_MARK_IGNORED_SURV_MODIFY; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = igrab(mark->i.inode); + if (!inode) + goto out; + ret = seq_printf(m, "fanotify ino:%lx sdev:%x " + "mflags:%x mask:%x ignored_mask:%x ", + inode->i_ino, inode->i_sb->s_dev, + mflags, mark->mask, mark->ignored_mask); + ret |= show_mark_fhandle(m, inode); + ret |= seq_putc(m, '\n'); + iput(inode); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { + struct mount *mnt = real_mount(mark->m.mnt); + + ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x " + "ignored_mask:%x\n", mnt->mnt_id, mflags, + mark->mask, mark->ignored_mask); + } +out: + return ret; +} + +int fanotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct fsnotify_group *group = f->private_data; + unsigned int flags = 0; + + switch (group->priority) { + case FS_PRIO_0: + flags |= FAN_CLASS_NOTIF; + break; + case FS_PRIO_1: + flags |= FAN_CLASS_CONTENT; + break; + case FS_PRIO_2: + flags |= FAN_CLASS_PRE_CONTENT; + break; + } + + if (group->max_events == UINT_MAX) + flags |= FAN_UNLIMITED_QUEUE; + + if (group->fanotify_data.max_marks == UINT_MAX) + flags |= FAN_UNLIMITED_MARKS; + + seq_printf(m, "fanotify flags:%x event-flags:%x\n", + flags, group->fanotify_data.f_flags); + + return show_fdinfo(m, f, fanotify_fdinfo); +} + +#endif /* CONFIG_FANOTIFY */ + +#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */ + +#endif /* CONFIG_PROC_FS */ diff --git a/trunk/fs/notify/fdinfo.h b/trunk/fs/notify/fdinfo.h new file mode 100644 index 000000000000..556afda990e9 --- /dev/null +++ b/trunk/fs/notify/fdinfo.h @@ -0,0 +1,27 @@ +#ifndef __FSNOTIFY_FDINFO_H__ +#define __FSNOTIFY_FDINFO_H__ + +#include +#include + +struct seq_file; +struct file; + +#ifdef CONFIG_PROC_FS + +#ifdef CONFIG_INOTIFY_USER +extern int inotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#ifdef CONFIG_FANOTIFY +extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#else /* CONFIG_PROC_FS */ + +#define inotify_show_fdinfo NULL +#define fanotify_show_fdinfo NULL + +#endif /* CONFIG_PROC_FS */ + +#endif 
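/*
 * Editor's note -- illustrative only, not part of the original patch.
 * Putting the pieces together (the pos/flags header printed by the
 * fs/proc/fd.c hook later in this series plus the per-mark lines from
 * inotify_fdinfo() and show_mark_fhandle() above), reading
 * /proc/<pid>/fdinfo/<fd> for an inotify descriptor should look roughly
 * like the lines below; every numeric value here is made up for the
 * example:
 *
 *	pos:	0
 *	flags:	00
 *	inotify wd:1 ino:1a10f sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:0fa1010000000000
 */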
/* __FSNOTIFY_FDINFO_H__ */ diff --git a/trunk/fs/notify/inode_mark.c b/trunk/fs/notify/inode_mark.c index b13c00ac48eb..f3035691f528 100644 --- a/trunk/fs/notify/inode_mark.c +++ b/trunk/fs/notify/inode_mark.c @@ -116,8 +116,9 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) * given a group and inode, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL */ -struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, - struct inode *inode) +static struct fsnotify_mark *fsnotify_find_inode_mark_locked( + struct fsnotify_group *group, + struct inode *inode) { struct fsnotify_mark *mark; struct hlist_node *pos; diff --git a/trunk/fs/notify/inotify/inotify_user.c b/trunk/fs/notify/inotify/inotify_user.c index c311dda054a3..36cb013c7c13 100644 --- a/trunk/fs/notify/inotify/inotify_user.c +++ b/trunk/fs/notify/inotify/inotify_user.c @@ -40,6 +40,7 @@ #include #include "inotify.h" +#include "../fdinfo.h" #include @@ -335,6 +336,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { + .show_fdinfo = inotify_show_fdinfo, .poll = inotify_poll, .read = inotify_read, .fasync = inotify_fasync, diff --git a/trunk/fs/ocfs2/extent_map.c b/trunk/fs/ocfs2/extent_map.c index 70b5863a2d64..f487aa343442 100644 --- a/trunk/fs/ocfs2/extent_map.c +++ b/trunk/fs/ocfs2/extent_map.c @@ -832,7 +832,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; @@ -843,7 +843,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); + BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE); ret = ocfs2_inode_lock(inode, &di_bh, 0); if (ret) { @@ -859,7 +859,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (origin == SEEK_HOLE) + if (whence == SEEK_HOLE) *offset = inode->i_size; goto out_unlock; } @@ -888,8 +888,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 
0 : 1; } - if ((!is_data && origin == SEEK_HOLE) || - (is_data && origin == SEEK_DATA)) { + if ((!is_data && whence == SEEK_HOLE) || + (is_data && whence == SEEK_DATA)) { if (extoff > *offset) *offset = extoff; goto out_unlock; @@ -899,7 +899,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) cpos += clen; } - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { extoff = cpos; extoff <<= cs_bits; extlen = clen; diff --git a/trunk/fs/ocfs2/file.c b/trunk/fs/ocfs2/file.c index dda089804942..fe492e1a3cfc 100644 --- a/trunk/fs/ocfs2/file.c +++ b/trunk/fs/ocfs2/file.c @@ -2637,14 +2637,14 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } /* Refer generic_file_llseek_unlocked() */ -static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret = 0; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_SET: break; case SEEK_END: @@ -2659,7 +2659,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) break; case SEEK_DATA: case SEEK_HOLE: - ret = ocfs2_seek_data_hole_offset(file, &offset, origin); + ret = ocfs2_seek_data_hole_offset(file, &offset, whence); if (ret) goto out; break; diff --git a/trunk/fs/open.c b/trunk/fs/open.c index 59071f55bf7f..182d8667b7bd 100644 --- a/trunk/fs/open.c +++ b/trunk/fs/open.c @@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) goto dput_and_out; error = -EPERM; - if (!capable(CAP_SYS_CHROOT)) + if (!nsown_capable(CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) diff --git a/trunk/fs/pnode.h b/trunk/fs/pnode.h index 65c60979d541..19b853a3445c 100644 --- a/trunk/fs/pnode.h +++ b/trunk/fs/pnode.h @@ -22,6 +22,7 @@ #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 +#define CL_SHARED_TO_SLAVE 0x20 static inline void set_mnt_shared(struct mount *mnt) { diff --git a/trunk/fs/proc/Makefile b/trunk/fs/proc/Makefile index 99349efbbc2b..981b05601931 100644 --- a/trunk/fs/proc/Makefile +++ b/trunk/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += uptime.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o +proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index d3696708fc1a..6a91e6ffbcbd 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { - struct user_namespace *user_ns = current_user_ns(); + struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; task_unlock(p); - for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); @@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_putc(m, '\n'); } -static void render_sigset_t(struct seq_file *m, const char *header, +void render_sigset_t(struct seq_file *m, const char *header, sigset_t 
*set) { int i; @@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } +/* Remove non-existent capabilities */ +#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ + CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) + static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -321,12 +325,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); + NORM_CAPS(cap_inheritable); + NORM_CAPS(cap_permitted); + NORM_CAPS(cap_effective); + NORM_CAPS(cap_bset); + render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); } +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); +#endif +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -360,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index aa63d25157b8..5a5a0be40e40 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -2345,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = { }; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; -} - -static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = proc_self_put_link, -}; - -/* - * proc base - * - * These are the directory entries in the root directory of /proc - * that properly belong to the /proc filesystem, as they describe - * describe something that is process related. 
- */ -static const struct pid_entry proc_base_stuff[] = { - NOD("self", S_IFLNK|S_IRWXUGO, - &proc_self_inode_operations, NULL, {}), -}; - -static struct dentry *proc_base_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct pid_entry *p = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error; - - /* Allocate the inode */ - error = ERR_PTR(-ENOMEM); - inode = new_inode(dir->i_sb); - if (!inode) - goto out; - - /* Initialize the inode */ - ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - /* - * grab the reference to the task. - */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) - goto out_iput; - - inode->i_mode = p->mode; - if (S_ISDIR(inode->i_mode)) - set_nlink(inode, 2); - if (S_ISLNK(inode->i_mode)) - inode->i_size = 64; - if (p->iop) - inode->i_op = p->iop; - if (p->fop) - inode->i_fop = p->fop; - ei->op = p->op; - d_add(dentry, inode); - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - /* Lookup the directory entry */ - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; - for (p = proc_base_stuff; p <= last; p++) { - if (p->len != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) - break; - } - if (p > last) - goto out; - - error = proc_base_instantiate(dir, dentry, task, p); - -out: - put_task_struct(task); -out_no_task: - return error; -} - -static int proc_base_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_base_instantiate, task, p); -} - #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { @@ -2839,10 +2699,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, @@ -2876,15 +2732,11 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result; + struct dentry *result = NULL; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - result = proc_base_lookup(dir, dentry); - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) - goto out; - tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2947,7 +2799,7 @@ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter ite return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2967,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr; - struct task_struct *reaper; struct 
tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out_no_task; - nr = filp->f_pos - FIRST_PROCESS_ENTRY; - - reaper = get_proc_task(filp->f_path.dentry->d_inode); - if (!reaper) - goto out_no_task; - - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { - const struct pid_entry *p = &proc_base_stuff[nr]; - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) - goto out; - } + goto out; ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ -3006,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: - put_task_struct(reaper); -out_no_task: return 0; } diff --git a/trunk/fs/proc/fd.c b/trunk/fs/proc/fd.c index f28a875f8779..d7a4a28ef630 100644 --- a/trunk/fs/proc/fd.c +++ b/trunk/fs/proc/fd.c @@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v) if (!ret) { seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", (long long)file->f_pos, f_flags); + if (file->f_op->show_fdinfo) + ret = file->f_op->show_fdinfo(m, file); fput(file); } diff --git a/trunk/fs/proc/generic.c b/trunk/fs/proc/generic.c index 0d80cef4cfb9..7b3ae3cc0ef9 100644 --- a/trunk/fs/proc/generic.c +++ b/trunk/fs/proc/generic.c @@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ -static unsigned int get_inode_number(void) +int proc_alloc_inum(unsigned int *inum) { unsigned int i; int error; retry: - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) - return 0; + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; spin_lock(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); @@ -365,18 +365,19 @@ static unsigned int get_inode_number(void) if (error == -EAGAIN) goto retry; else if (error) - return 0; + return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); - return 0; + return -ENOSPC; } - return PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + i; + return 0; } -static void release_inode_number(unsigned int inum) +void proc_free_inum(unsigned int inum) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); @@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - unsigned int i; struct proc_dir_entry *tmp; + int ret; - i = get_inode_number(); - if (i == 0) - return -EAGAIN; - dp->low_ino = i; + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { @@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - release_inode_number(de->low_ino); + proc_free_inum(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); diff --git a/trunk/fs/proc/inode.c b/trunk/fs/proc/inode.c index 3b22bbdee9ec..439ae6886507 100644 --- a/trunk/fs/proc/inode.c +++ b/trunk/fs/proc/inode.c @@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode) struct proc_dir_entry *de; struct ctl_table_header *head; const struct proc_ns_operations *ns_ops; + void *ns; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode) } /* Release any associated namespace */ ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && 
ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); + ns = PROC_I(inode)->ns; + if (ns_ops && ns) + ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 43973b084abf..252544c05207 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -15,6 +15,7 @@ struct ctl_table_header; struct mempolicy; extern struct proc_dir_entry proc_root; +extern void proc_self_init(void); #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void sysctl_head_put(struct ctl_table_header *head); diff --git a/trunk/fs/proc/namespaces.c b/trunk/fs/proc/namespaces.c index b178ed733c36..b7a47196c8c3 100644 --- a/trunk/fs/proc/namespaces.c +++ b/trunk/fs/proc/namespaces.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_IPC_NS &ipcns_operations, #endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, +#endif + &mntns_operations, }; static const struct file_operations ns_file_operations = { .llseek = no_llseek, }; +static const struct inode_operations ns_inode_operations = { + .setattr = proc_setattr, +}; + +static int ns_delete_dentry(const struct dentry *dentry) +{ + /* Don't cache namespace inodes when not in use */ + return 1; +} + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_delete = ns_delete_dentry, + .d_dname = ns_dname, +}; + +static struct dentry *proc_ns_get_dentry(struct super_block *sb, + struct task_struct *task, const struct proc_ns_operations *ns_ops) +{ + struct dentry *dentry, *result; + struct inode *inode; + struct proc_inode *ei; + struct qstr qname = { .name = "", }; + void *ns; + + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + dentry = d_alloc_pseudo(sb, &qname); + if (!dentry) { + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + inode = iget_locked(sb, ns_ops->inum(ns)); + if (!inode) { + dput(dentry); + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + ei = PROC_I(inode); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } + + d_set_d_op(dentry, &ns_dentry_operations); + result = d_instantiate_unique(dentry, inode); + if (result) { + dput(dentry); + dentry = result; + } + + return dentry; +} + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct proc_inode *ei = PROC_I(inode); + struct task_struct *task; + struct dentry *ns_dentry; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_dentry)) { + error = ERR_CAST(ns_dentry); + goto out_put_task; + } + + dput(nd->path.dentry); + nd->path.dentry = ns_dentry; + error = NULL; + +out_put_task: + 
put_task_struct(task); +out: + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = ei->ns_ops; + struct task_struct *task; + void *ns; + char name[50]; + int len = -EACCES; + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + len = -ENOENT; + ns = ns_ops->get(task); + if (!ns) + goto out_put_task; + + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); + len = strlen(name); + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, name, len)) + len = -EFAULT; + + ns_ops->put(ns); +out_put_task: + put_task_struct(task); +out: + return len; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + static struct dentry *proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); - void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; - ns = ns_ops->get(task); - if (!ns) - goto out_iput; - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, error = NULL; out: return error; -out_iput: - iput(inode); - goto out; } static int proc_ns_fill_cache(struct file *filp, void *dirent, @@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent, if (!task) goto out_no_task; - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - ret = 0; i = filp->f_pos; switch (i) { @@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!task) goto out_no_task; - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) @@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } - error = ERR_PTR(-ENOENT); if (entry == last) goto out; @@ -198,3 +337,7 @@ struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } +bool proc_ns_inode(struct inode *inode) +{ + return inode->i_fop == &ns_file_operations; +} diff --git a/trunk/fs/proc/proc_devtree.c b/trunk/fs/proc/proc_devtree.c index df7dd08d4391..de20ec480fa0 100644 --- a/trunk/fs/proc/proc_devtree.c +++ b/trunk/fs/proc/proc_devtree.c @@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np, set_node_proc_entry(np, de); for (child = NULL; (child = of_get_next_child(np, child));) { /* Use everything after the last slash, or the full name */ - p = strrchr(child->full_name, '/'); - if (!p) - p = child->full_name; - else - ++p; + p = kbasename(child->full_name); if (duplicate_name(de, p)) p = fixup_name(np, de, p); diff --git a/trunk/fs/proc/root.c b/trunk/fs/proc/root.c index 
9889a92d2e01..c6e9fac26bac 100644 --- a/trunk/fs/proc/root.c +++ b/trunk/fs/proc/root.c @@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } @@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - return dget(sb->s_root); } @@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -163,12 +156,8 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/trunk/fs/proc/self.c b/trunk/fs/proc/self.c new file mode 100644 index 000000000000..aa5cc3bff140 --- /dev/null +++ b/trunk/fs/proc/self.c @@ -0,0 +1,59 @@ +#include +#include +#include + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +void __init proc_self_init(void) +{ + struct proc_dir_entry *proc_self_symlink; + mode_t mode; + + mode = S_IFLNK | S_IRWXUGO; + proc_self_symlink = proc_create("self", mode, NULL, NULL ); + proc_self_symlink->proc_iops = &proc_self_inode_operations; +} diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c index 48775628abbf..448455b7fd91 100644 --- a/trunk/fs/proc/task_mmu.c +++ b/trunk/fs/proc/task_mmu.c @@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... 
(BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_NONLINEAR)] = "nl", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) seq_printf(m, "Nonlinear: %8lu kB\n", mss.nonlinear >> 10); + show_smap_vma_flags(m, vma); + if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) ? vma->vm_start : 0; diff --git a/trunk/fs/pstore/inode.c b/trunk/fs/pstore/inode.c index ed1d8c7212da..67de74ca85f4 100644 --- a/trunk/fs/pstore/inode.c +++ b/trunk/fs/pstore/inode.c @@ -151,13 +151,13 @@ static int pstore_file_open(struct inode *inode, struct file *file) return 0; } -static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) +static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) { struct seq_file *sf = file->private_data; if (sf->op) - return seq_lseek(file, off, origin); - return default_llseek(file, off, origin); + return seq_lseek(file, off, whence); + return default_llseek(file, off, whence); } static const struct file_operations pstore_file_operations = { diff --git a/trunk/fs/read_write.c b/trunk/fs/read_write.c index d06534857e9e..1edaf099ddd7 100644 --- a/trunk/fs/read_write.c +++ b/trunk/fs/read_write.c @@ -54,7 +54,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * @size: max size of this file in file system * @eof: offset used for SEEK_END position * @@ -67,12 +67,12 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * read/writes behave like SEEK_SET against seeks. */ loff_t -generic_file_llseek_size(struct file *file, loff_t offset, int origin, +generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { struct inode *inode = file->f_mapping->host; - switch (origin) { + switch (whence) { case SEEK_END: offset += eof; break; @@ -122,17 +122,17 @@ EXPORT_SYMBOL(generic_file_llseek_size); * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. 
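As an aside on the origin -> whence rename running through fs/read_write.c: the three classic whence values resolve exactly as in the sketch below, a user-space rendition of the switch generic_file_llseek_size() performs (the kernel version additionally clamps against maxsize and handles further cases); resolve_offset() and the sample numbers are made up for illustration only.

#include <stdio.h>
#include <unistd.h>	/* SEEK_SET, SEEK_CUR, SEEK_END */

/* Illustrative only: how a whence value turns an offset argument into an
 * absolute file position, given the current position and the file size. */
static long long resolve_offset(long long offset, int whence,
				long long pos, long long size)
{
	switch (whence) {
	case SEEK_SET:
		return offset;		/* absolute */
	case SEEK_CUR:
		return pos + offset;	/* relative to current position */
	case SEEK_END:
		return size + offset;	/* relative to end of file */
	default:
		return -1;		/* invalid whence */
	}
}

int main(void)
{
	/* a 1000-byte file with the position currently at 100 */
	printf("%lld\n", resolve_offset(50, SEEK_SET, 100, 1000));	/* 50 */
	printf("%lld\n", resolve_offset(50, SEEK_CUR, 100, 1000));	/* 150 */
	printf("%lld\n", resolve_offset(-50, SEEK_END, 100, 1000));	/* 950 */
	return 0;
}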
It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. + * @offset and @whence under i_mutex. */ -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } @@ -142,32 +142,32 @@ EXPORT_SYMBOL(generic_file_llseek); * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ -loff_t noop_llseek(struct file *file, loff_t offset, int origin) +loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int origin) +loff_t no_llseek(struct file *file, loff_t offset, int whence) { return -ESPIPE; } EXPORT_SYMBOL(no_llseek); -loff_t default_llseek(struct file *file, loff_t offset, int origin) +loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_path.dentry->d_inode; loff_t retval; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_END: offset += i_size_read(inode); break; @@ -216,7 +216,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(default_llseek); -loff_t vfs_llseek(struct file *file, loff_t offset, int origin) +loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { loff_t (*fn)(struct file *, loff_t, int); @@ -225,11 +225,11 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) if (file->f_op && file->f_op->llseek) fn = file->f_op->llseek; } - return fn(file, offset, origin); + return fn(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); -SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; struct fd f = fdget(fd); @@ -237,8 +237,8 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) return -EBADF; retval = -EINVAL; - if (origin <= SEEK_MAX) { - loff_t res = vfs_llseek(f.file, offset, origin); + if (whence <= SEEK_MAX) { + loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ @@ -250,7 +250,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) #ifdef __ARCH_WANT_SYS_LLSEEK SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, - unsigned int, origin) + unsigned int, whence) { int retval; struct fd f = fdget(fd); @@ -260,11 +260,11 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, return -EBADF; retval = -EINVAL; - if (origin > SEEK_MAX) + if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, - origin); + whence); retval = (int)offset; if (offset >= 0) { diff --git a/trunk/fs/seq_file.c 
b/trunk/fs/seq_file.c index 99dffab4c4e4..9d863fb501f9 100644 --- a/trunk/fs/seq_file.c +++ b/trunk/fs/seq_file.c @@ -300,14 +300,14 @@ EXPORT_SYMBOL(seq_read); * * Ready-made ->f_op->llseek() */ -loff_t seq_lseek(struct file *file, loff_t offset, int origin) +loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); m->version = file->f_version; - switch (origin) { + switch (whence) { case 1: offset += file->f_pos; case 0: diff --git a/trunk/fs/signalfd.c b/trunk/fs/signalfd.c index 8bee4e570911..b53486961735 100644 --- a/trunk/fs/signalfd.c +++ b/trunk/fs/signalfd.c @@ -29,6 +29,7 @@ #include #include #include +#include void signalfd_cleanup(struct sighand_struct *sighand) { @@ -227,7 +228,24 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, return total ? total: ret; } +#ifdef CONFIG_PROC_FS +static int signalfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct signalfd_ctx *ctx = f->private_data; + sigset_t sigmask; + + sigmask = ctx->sigmask; + signotset(&sigmask); + render_sigset_t(m, "sigmask:\t", &sigmask); + + return 0; +} +#endif + static const struct file_operations signalfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = signalfd_show_fdinfo, +#endif .release = signalfd_release, .poll = signalfd_poll, .read = signalfd_read, diff --git a/trunk/fs/sysfs/mount.c b/trunk/fs/sysfs/mount.c index 71eb7e253927..db940a9be045 100644 --- a/trunk/fs/sysfs/mount.c +++ b/trunk/fs/sysfs/mount.c @@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/trunk/fs/ubifs/debug.c b/trunk/fs/ubifs/debug.c index 62911637e12f..12817ffc7345 100644 --- a/trunk/fs/ubifs/debug.c +++ b/trunk/fs/ubifs/debug.c @@ -2560,7 +2560,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int len) { - unsigned int from, to, i, ffs = chance(1, 2); + unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; from = random32() % (len + 1); @@ -2571,11 +2571,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, ffs ? 
"0xFFs" : "random data"); if (ffs) - for (i = from; i < to; i++) - p[i] = 0xFF; + memset(p + from, 0xFF, to - from); else - for (i = from; i < to; i++) - p[i] = random32() % 0x100; + prandom_bytes(p + from, to - from); return to; } diff --git a/trunk/fs/ubifs/dir.c b/trunk/fs/ubifs/dir.c index e271fba1651b..8a574776a493 100644 --- a/trunk/fs/ubifs/dir.c +++ b/trunk/fs/ubifs/dir.c @@ -453,11 +453,11 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } /* If a directory is seeked, we have to free saved readdir() state */ -static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) { kfree(file->private_data); file->private_data = NULL; - return generic_file_llseek(file, offset, origin); + return generic_file_llseek(file, offset, whence); } /* Free saved readdir() state when the directory is closed */ diff --git a/trunk/include/asm-generic/io.h b/trunk/include/asm-generic/io.h index 9e0ebe051243..d1e93284d72a 100644 --- a/trunk/include/asm-generic/io.h +++ b/trunk/include/asm-generic/io.h @@ -154,7 +154,7 @@ static inline void insb(unsigned long addr, void *buffer, int count) if (count) { u8 *buf = buffer; do { - u8 x = inb(addr); + u8 x = __raw_readb(addr + PCI_IOBASE); *buf++ = x; } while (--count); } @@ -167,7 +167,7 @@ static inline void insw(unsigned long addr, void *buffer, int count) if (count) { u16 *buf = buffer; do { - u16 x = inw(addr); + u16 x = __raw_readw(addr + PCI_IOBASE); *buf++ = x; } while (--count); } @@ -180,7 +180,7 @@ static inline void insl(unsigned long addr, void *buffer, int count) if (count) { u32 *buf = buffer; do { - u32 x = inl(addr); + u32 x = __raw_readl(addr + PCI_IOBASE); *buf++ = x; } while (--count); } @@ -193,7 +193,7 @@ static inline void outsb(unsigned long addr, const void *buffer, int count) if (count) { const u8 *buf = buffer; do { - outb(*buf++, addr); + __raw_writeb(*buf++, addr + PCI_IOBASE); } while (--count); } } @@ -205,7 +205,7 @@ static inline void outsw(unsigned long addr, const void *buffer, int count) if (count) { const u16 *buf = buffer; do { - outw(*buf++, addr); + __raw_writew(*buf++, addr + PCI_IOBASE); } while (--count); } } @@ -217,7 +217,7 @@ static inline void outsl(unsigned long addr, const void *buffer, int count) if (count) { const u32 *buf = buffer; do { - outl(*buf++, addr); + __raw_writel(*buf++, addr + PCI_IOBASE); } while (--count); } } diff --git a/trunk/include/linux/asn1.h b/trunk/include/linux/asn1.h index 5c3f4e4b9a23..eed6982860ba 100644 --- a/trunk/include/linux/asn1.h +++ b/trunk/include/linux/asn1.h @@ -64,4 +64,6 @@ enum asn1_tag { ASN1_LONG_TAG = 31 /* Long form tag */ }; +#define ASN1_INDEFINITE_LENGTH 0x80 + #endif /* _LINUX_ASN1_H */ diff --git a/trunk/include/linux/backing-dev.h b/trunk/include/linux/backing-dev.h index 238521a19849..2a9a9abc9126 100644 --- a/trunk/include/linux/backing-dev.h +++ b/trunk/include/linux/backing-dev.h @@ -18,7 +18,6 @@ #include #include #include -#include struct page; struct device; @@ -106,9 +105,6 @@ struct backing_dev_info { struct timer_list laptop_mode_wb_timer; - cpumask_t *flusher_cpumask; /* used for writeback thread scheduling */ - struct mutex flusher_cpumask_lock; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; struct dentry *debug_stats; diff --git a/trunk/include/linux/backlight.h b/trunk/include/linux/backlight.h index 5ffc6dda4675..da9a0825e007 100644 --- a/trunk/include/linux/backlight.h +++ b/trunk/include/linux/backlight.h @@ -134,4 
+134,14 @@ struct generic_bl_info { void (*kick_battery)(void); }; +#ifdef CONFIG_OF +struct backlight_device *of_find_backlight_by_node(struct device_node *node); +#else +static inline struct backlight_device * +of_find_backlight_by_node(struct device_node *node) +{ + return NULL; +} +#endif + #endif diff --git a/trunk/include/linux/bcma/bcma.h b/trunk/include/linux/bcma/bcma.h index 93b1e091b1e9..e0ce311011c0 100644 --- a/trunk/include/linux/bcma/bcma.h +++ b/trunk/include/linux/bcma/bcma.h @@ -350,6 +350,7 @@ extern void bcma_core_set_clockmode(struct bcma_device *core, enum bcma_clkmode clkmode); extern void bcma_core_pll_ctl(struct bcma_device *core, u32 req, u32 status, bool on); +extern u32 bcma_chipco_pll_read(struct bcma_drv_cc *cc, u32 offset); #define BCMA_DMA_TRANSLATION_MASK 0xC0000000 #define BCMA_DMA_TRANSLATION_NONE 0x00000000 #define BCMA_DMA_TRANSLATION_DMA32_CMT 0x40000000 /* Client Mode Translation for 32-bit DMA */ diff --git a/trunk/include/linux/binfmts.h b/trunk/include/linux/binfmts.h index 2630c9b41a86..a4c2b565c835 100644 --- a/trunk/include/linux/binfmts.h +++ b/trunk/include/linux/binfmts.h @@ -54,8 +54,6 @@ struct linux_binprm { #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) -#define BINPRM_MAX_RECURSION 4 - /* Function parameter for binfmt->coredump */ struct coredump_params { siginfo_t *siginfo; diff --git a/trunk/include/linux/blkdev.h b/trunk/include/linux/blkdev.h index acb4f7bbbd32..f94bc83011ed 100644 --- a/trunk/include/linux/blkdev.h +++ b/trunk/include/linux/blkdev.h @@ -1188,14 +1188,25 @@ static inline int queue_discard_alignment(struct request_queue *q) static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { - sector_t alignment = sector << 9; - alignment = sector_div(alignment, lim->discard_granularity); + unsigned int alignment, granularity, offset; if (!lim->max_discard_sectors) return 0; - alignment = lim->discard_granularity + lim->discard_alignment - alignment; - return sector_div(alignment, lim->discard_granularity); + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> 9; + granularity = lim->discard_granularity >> 9; + if (!granularity) + return 0; + + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); + + /* And why do we do this modulus *again* in blkdev_issue_discard()? 
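The discard-alignment arithmetic introduced in the blkdev.h hunk above can be checked with a small user-space rendition; discard_alignment_offset() is a made-up name, everything is kept in 512-byte sectors, and the kernel function converts the result back to bytes at the end.

#include <stdio.h>

/* Given the discard granularity and the device's discard_alignment (both in
 * sectors), return how far "sector" sits from the next aligned boundary. */
static unsigned int discard_alignment_offset(unsigned long long sector,
					     unsigned int granularity,
					     unsigned int alignment)
{
	unsigned int offset;

	if (!granularity)
		return 0;

	/* offset of the partition start inside one granularity-sized chunk */
	offset = sector % granularity;

	/* distance from there to the next properly aligned boundary */
	return (granularity + alignment - offset) % granularity;
}

int main(void)
{
	/* e.g. 1 MiB granularity = 2048 sectors, no extra device alignment,
	 * a partition starting at sector 63 */
	printf("%u\n", discard_alignment_offset(63, 2048, 0));	/* prints 1985 */
	return 0;
}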
*/ + offset = (granularity + alignment - offset) % granularity; + + /* Turn it back into bytes, gaah */ + return offset << 9; } static inline int bdev_discard_alignment(struct block_device *bdev) diff --git a/trunk/include/linux/compat.h b/trunk/include/linux/compat.h index 784ebfe63c48..e4920bd58a47 100644 --- a/trunk/include/linux/compat.h +++ b/trunk/include/linux/compat.h @@ -588,6 +588,9 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); +asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval); + #else #define is_compat_task() (0) diff --git a/trunk/include/linux/compiler-gcc4.h b/trunk/include/linux/compiler-gcc4.h index 412bc6c2b023..662fd1b4c42a 100644 --- a/trunk/include/linux/compiler-gcc4.h +++ b/trunk/include/linux/compiler-gcc4.h @@ -31,6 +31,8 @@ #define __linktime_error(message) __attribute__((__error__(message))) +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + #if __GNUC_MINOR__ >= 5 /* * Mark a position in code as unreachable. This can be used to @@ -63,3 +65,13 @@ #define __compiletime_warning(message) __attribute__((warning(message))) #define __compiletime_error(message) __attribute__((error(message))) #endif + +#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP +#if __GNUC_MINOR__ >= 4 +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#endif +#if __GNUC_MINOR__ >= 8 || (defined(__powerpc__) && __GNUC_MINOR__ >= 6) +#define __HAVE_BUILTIN_BSWAP16__ +#endif +#endif diff --git a/trunk/include/linux/compiler-intel.h b/trunk/include/linux/compiler-intel.h index d8e636e5607d..973ce10c40b6 100644 --- a/trunk/include/linux/compiler-intel.h +++ b/trunk/include/linux/compiler-intel.h @@ -29,3 +29,10 @@ #endif #define uninitialized_var(x) x + +#ifndef __HAVE_BUILTIN_BSWAP16__ +/* icc has this, but it's called _bswap16 */ +#define __HAVE_BUILTIN_BSWAP16__ +#define __builtin_bswap16 _bswap16 +#endif + diff --git a/trunk/include/linux/compiler.h b/trunk/include/linux/compiler.h index f430e4162f41..dd852b73b286 100644 --- a/trunk/include/linux/compiler.h +++ b/trunk/include/linux/compiler.h @@ -10,6 +10,7 @@ # define __force __attribute__((force)) # define __nocast __attribute__((nocast)) # define __iomem __attribute__((noderef, address_space(2))) +# define __must_hold(x) __attribute__((context(x,1,1))) # define __acquires(x) __attribute__((context(x,0,1))) # define __releases(x) __attribute__((context(x,1,0))) # define __acquire(x) __context__(x,1) @@ -33,6 +34,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 # define __builtin_warning(x, y...) (1) +# define __must_hold(x) # define __acquires(x) # define __releases(x) # define __acquire(x) (void)0 @@ -42,6 +44,10 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __rcu #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a,b) a##b +#define __PASTE(a,b) ___PASTE(a,b) + #ifdef __KERNEL__ #ifdef __GNUC__ @@ -164,6 +170,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); (typeof(ptr)) (__ptr + (off)); }) #endif +/* Not-quite-unique ID. 
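The __UNIQUE_ID/__PASTE machinery added above (and the __LINE__ fallback that follows) is easiest to see with a stand-alone example. PASTE and UNIQUE_NAME below are invented names, __COUNTER__ is a GCC/Clang extension, and this is a sketch of the token-pasting technique rather than the kernel's own macros.

#include <stdio.h>

/* Two levels of indirection so __COUNTER__ is expanded before pasting. */
#define PASTE2(a, b)	a##b
#define PASTE(a, b)	PASTE2(a, b)
#define UNIQUE_NAME(prefix)	PASTE(PASTE(prefix, _), __COUNTER__)

static int UNIQUE_NAME(tmp);	/* expands to something like tmp_0 */
static int UNIQUE_NAME(tmp);	/* ...and tmp_1, so there is no redefinition */

int main(void)
{
	printf("two distinct file-scope variables were declared\n");
	return 0;
}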
*/ +#ifndef __UNIQUE_ID +# define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) +#endif + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/trunk/include/linux/cred.h b/trunk/include/linux/cred.h index 0142aacb70b7..abb2cd50f6b2 100644 --- a/trunk/include/linux/cred.h +++ b/trunk/include/linux/cred.h @@ -344,10 +344,8 @@ static inline void put_cred(const struct cred *_cred) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) -#define task_user_ns(task) (task_cred_xxx((task), user_ns)) #else #define current_user_ns() (&init_user_ns) -#define task_user_ns(task) (&init_user_ns) #endif diff --git a/trunk/include/linux/dma-buf.h b/trunk/include/linux/dma-buf.h index eb48f3816df9..bd2e52ccc4f2 100644 --- a/trunk/include/linux/dma-buf.h +++ b/trunk/include/linux/dma-buf.h @@ -156,7 +156,6 @@ static inline void get_dma_buf(struct dma_buf *dmabuf) get_file(dmabuf->file); } -#ifdef CONFIG_DMA_SHARED_BUFFER struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev); void dma_buf_detach(struct dma_buf *dmabuf, @@ -184,103 +183,5 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); void *dma_buf_vmap(struct dma_buf *); void dma_buf_vunmap(struct dma_buf *, void *vaddr); -#else - -static inline struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, - struct device *dev) -{ - return ERR_PTR(-ENODEV); -} - -static inline void dma_buf_detach(struct dma_buf *dmabuf, - struct dma_buf_attachment *dmabuf_attach) -{ - return; -} - -static inline struct dma_buf *dma_buf_export(void *priv, - const struct dma_buf_ops *ops, - size_t size, int flags) -{ - return ERR_PTR(-ENODEV); -} - -static inline int dma_buf_fd(struct dma_buf *dmabuf, int flags) -{ - return -ENODEV; -} - -static inline struct dma_buf *dma_buf_get(int fd) -{ - return ERR_PTR(-ENODEV); -} - -static inline void dma_buf_put(struct dma_buf *dmabuf) -{ - return; -} - -static inline struct sg_table *dma_buf_map_attachment( - struct dma_buf_attachment *attach, enum dma_data_direction write) -{ - return ERR_PTR(-ENODEV); -} - -static inline void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, - struct sg_table *sg, enum dma_data_direction dir) -{ - return; -} - -static inline int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, - size_t start, size_t len, - enum dma_data_direction dir) -{ - return -ENODEV; -} - -static inline void dma_buf_end_cpu_access(struct dma_buf *dmabuf, - size_t start, size_t len, - enum dma_data_direction dir) -{ -} - -static inline void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, - unsigned long pnum) -{ - return NULL; -} - -static inline void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, - unsigned long pnum, void *vaddr) -{ -} - -static inline void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long pnum) -{ - return NULL; -} - -static inline void dma_buf_kunmap(struct dma_buf *dmabuf, - unsigned long pnum, void *vaddr) -{ -} - -static inline int dma_buf_mmap(struct dma_buf *dmabuf, - struct vm_area_struct *vma, - unsigned long pgoff) -{ - return -ENODEV; -} - -static inline void *dma_buf_vmap(struct dma_buf *dmabuf) -{ - return NULL; -} - -static inline void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) -{ -} -#endif /* CONFIG_DMA_SHARED_BUFFER */ #endif /* __DMA_BUF_H__ */ diff --git a/trunk/include/linux/drbd.h b/trunk/include/linux/drbd.h index 47e3d4850584..0c5a18ec322c 100644 --- a/trunk/include/linux/drbd.h +++ 
b/trunk/include/linux/drbd.h @@ -51,12 +51,11 @@ #endif - extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.13" -#define API_VERSION 88 +#define REL_VERSION "8.4.2" +#define API_VERSION 1 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 96 +#define PRO_VERSION_MAX 101 enum drbd_io_error_p { @@ -66,7 +65,8 @@ enum drbd_io_error_p { }; enum drbd_fencing_p { - FP_DONT_CARE, + FP_NOT_AVAIL = -1, /* Not a policy */ + FP_DONT_CARE = 0, FP_RESOURCE, FP_STONITH }; @@ -102,6 +102,20 @@ enum drbd_on_congestion { OC_DISCONNECT, }; +enum drbd_read_balancing { + RB_PREFER_LOCAL, + RB_PREFER_REMOTE, + RB_ROUND_ROBIN, + RB_LEAST_PENDING, + RB_CONGESTED_REMOTE, + RB_32K_STRIPING, + RB_64K_STRIPING, + RB_128K_STRIPING, + RB_256K_STRIPING, + RB_512K_STRIPING, + RB_1M_STRIPING, +}; + /* KEEP the order, do not delete or insert. Only append. */ enum drbd_ret_code { ERR_CODE_BASE = 100, @@ -122,7 +136,7 @@ enum drbd_ret_code { ERR_AUTH_ALG = 120, ERR_AUTH_ALG_ND = 121, ERR_NOMEM = 122, - ERR_DISCARD = 123, + ERR_DISCARD_IMPOSSIBLE = 123, ERR_DISK_CONFIGURED = 124, ERR_NET_CONFIGURED = 125, ERR_MANDATORY_TAG = 126, @@ -130,8 +144,8 @@ enum drbd_ret_code { ERR_INTR = 129, /* EINTR */ ERR_RESIZE_RESYNC = 130, ERR_NO_PRIMARY = 131, - ERR_SYNC_AFTER = 132, - ERR_SYNC_AFTER_CYCLE = 133, + ERR_RESYNC_AFTER = 132, + ERR_RESYNC_AFTER_CYCLE = 133, ERR_PAUSE_IS_SET = 134, ERR_PAUSE_IS_CLEAR = 135, ERR_PACKET_NR = 137, @@ -155,6 +169,14 @@ enum drbd_ret_code { ERR_CONG_NOT_PROTO_A = 155, ERR_PIC_AFTER_DEP = 156, ERR_PIC_PEER_DEP = 157, + ERR_RES_NOT_KNOWN = 158, + ERR_RES_IN_USE = 159, + ERR_MINOR_CONFIGURED = 160, + ERR_MINOR_EXISTS = 161, + ERR_INVALID_REQUEST = 162, + ERR_NEED_APV_100 = 163, + ERR_NEED_ALLOW_TWO_PRI = 164, + ERR_MD_UNCLEAN = 165, /* insert new ones above this line */ AFTER_LAST_ERR_CODE @@ -296,7 +318,8 @@ enum drbd_state_rv { SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ - SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ + SS_O_VOL_PEER_PRI = -20, + SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ }; /* from drbd_strings.c */ @@ -313,7 +336,9 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv); #define MDF_FULL_SYNC (1 << 3) #define MDF_WAS_UP_TO_DATE (1 << 4) #define MDF_PEER_OUT_DATED (1 << 5) -#define MDF_CRASHED_PRIMARY (1 << 6) +#define MDF_CRASHED_PRIMARY (1 << 6) +#define MDF_AL_CLEAN (1 << 7) +#define MDF_AL_DISABLED (1 << 8) enum drbd_uuid_index { UI_CURRENT, @@ -333,37 +358,23 @@ enum drbd_timeout_flag { #define UUID_JUST_CREATED ((__u64)4) +/* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 -#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) #define DRBD_MAGIC_BIG 0x835a -#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) +#define DRBD_MAGIC_100 0x8620ec20 + +#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3) +#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4) +#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5) + + +/* how I came up with this magic? 
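The comment that continues on the next line attributes DRBD_AL_MAGIC to base64-decoding "actlog=="; a throwaway user-space check (b64val() and the decode loop are written only for this illustration) prints the bytes 69 cb 65 a2, matching the 0x69cb65a2 constant.

#include <stdio.h>
#include <string.h>

/* Map one base64 character to its 6-bit value. */
static int b64val(char c)
{
	static const char tbl[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	const char *p = strchr(tbl, c);

	return p ? (int)(p - tbl) : -1;
}

int main(void)
{
	const char *s = "actlog==";
	unsigned int i, acc = 0, bits = 0;

	for (i = 0; s[i] && s[i] != '='; i++) {
		acc = ((acc << 6) | b64val(s[i])) & 0xffffff;
		bits += 6;
		if (bits >= 8) {
			bits -= 8;
			printf("%02x ", (acc >> bits) & 0xff);
		}
	}
	printf("\n");	/* prints: 69 cb 65 a2 */
	return 0;
}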
+ * base64 decode "actlog==" ;) */ +#define DRBD_AL_MAGIC 0x69cb65a2 /* these are of type "int" */ #define DRBD_MD_INDEX_INTERNAL -1 #define DRBD_MD_INDEX_FLEX_EXT -2 #define DRBD_MD_INDEX_FLEX_INT -3 -/* Start of the new netlink/connector stuff */ - -#define DRBD_NL_CREATE_DEVICE 0x01 -#define DRBD_NL_SET_DEFAULTS 0x02 - - -/* For searching a vacant cn_idx value */ -#define CN_IDX_STEP 6977 - -struct drbd_nl_cfg_req { - int packet_type; - unsigned int drbd_minor; - int flags; - unsigned short tag_list[]; -}; - -struct drbd_nl_cfg_reply { - int packet_type; - unsigned int minor; - int ret_code; /* enum ret_code or set_st_err_t */ - unsigned short tag_list[]; /* only used with get_* calls */ -}; - #endif diff --git a/trunk/include/linux/drbd_genl.h b/trunk/include/linux/drbd_genl.h new file mode 100644 index 000000000000..d0d8fac8a6e4 --- /dev/null +++ b/trunk/include/linux/drbd_genl.h @@ -0,0 +1,378 @@ +/* + * General overview: + * full generic netlink message: + * |nlmsghdr|genlmsghdr| + * + * payload: + * |optional fixed size family header| + * + * sequence of netlink attributes: + * I chose to have all "top level" attributes NLA_NESTED, + * corresponding to some real struct. + * So we have a sequence of |tla, len| + * + * nested nla sequence: + * may be empty, or contain a sequence of netlink attributes + * representing the struct fields. + * + * The tag number of any field (regardless of containing struct) + * will be available as T_ ## field_name, + * so you cannot have the same field name in two differnt structs. + * + * The tag numbers themselves are per struct, though, + * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type, + * which we won't use here). + * The tag numbers are used as index in the respective nla_policy array. + * + * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy + * genl_magic_struct.h + * generates the struct declaration, + * generates an entry in the tla enum, + * genl_magic_func.h + * generates an entry in the static tla policy + * with .type = NLA_NESTED + * generates the static _nl_policy definition, + * and static conversion functions + * + * genl_magic_func.h + * + * GENL_mc_group(group) + * genl_magic_struct.h + * does nothing + * genl_magic_func.h + * defines and registers the mcast group, + * and provides a send helper + * + * GENL_notification(op_name, op_num, mcast_group, tla list) + * These are notifications to userspace. + * + * genl_magic_struct.h + * generates an entry in the genl_ops enum, + * genl_magic_func.h + * does nothing + * + * mcast group: the name of the mcast group this notification should be + * expected on + * tla list: the list of expected top level attributes, + * for documentation and sanity checking. + * + * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations" + * These are requests from userspace. + * + * _op and _notification share the same "number space", + * op_nr will be assigned to "genlmsghdr->cmd" + * + * genl_magic_struct.h + * generates an entry in the genl_ops enum, + * genl_magic_func.h + * generates an entry in the static genl_ops array, + * and static register/unregister functions to + * genl_register_family_with_ops(). + * + * flags and handler: + * GENL_op_init( .doit = x, .dumpit = y, .flags = something) + * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM + * tla list: the list of expected top level attributes, + * for documentation and sanity checking. 
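The expansion scheme described in the overview above is a classic X-macro: one field list is expanded under different macro definitions to generate a struct, a policy table, and so on. The snippet below is a generic, self-contained illustration of that technique; DEMO_FIELDS and the other names are invented here and are not part of the genl_magic headers.

#include <stdio.h>

/* One field list, expanded twice with different per-field macros. */
#define DEMO_FIELDS(F)		\
	F(int,          minor)	\
	F(unsigned int, volume)	\
	F(long,         size)

#define DECLARE_MEMBER(type, name)	type name;
#define FIELD_NAME(type, name)		#name,

struct demo_conf {
	DEMO_FIELDS(DECLARE_MEMBER)	/* int minor; unsigned int volume; ... */
};

static const char *demo_field_names[] = {
	DEMO_FIELDS(FIELD_NAME)		/* "minor", "volume", "size" */
};

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(demo_field_names) / sizeof(demo_field_names[0]); i++)
		printf("field %zu: %s\n", i, demo_field_names[i]);
	return 0;
}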
+ */ + +/* + * STRUCTS + */ + +/* this is sent kernel -> userland on various error conditions, and contains + * informational textual info, which is supposedly human readable. + * The computer relevant return code is in the drbd_genlmsghdr. + */ +GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, + /* "arbitrary" size strings, nla_policy.len = 0 */ + __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) +) + +/* Configuration requests typically need a context to operate on. + * Possible keys are device minor (fits in the drbd_genlmsghdr), + * the replication link (aka connection) name, + * and/or the replication group (aka resource) name, + * and the volume id within the resource. */ +GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, + __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) + __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) + __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) + __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) +) + +GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, + __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128) + __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128) + __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) + + /* use the resize command to try and change the disk_size */ + __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) + /* we could change the max_bio_bvecs, + * but it won't propagate through the stack */ + __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) + + __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) + __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) + + __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) + __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) + __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) + __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) + __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) + __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) + __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) + __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) + + __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) + __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) + __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) + __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) + /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ + __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) +) + +GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, + __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32) + __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) +) + +GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, + __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, + shared_secret, SHARED_SECRET_MAX) + __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) + __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) + __str_field_def(4, 
DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) + __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) + __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) + __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) + __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) + __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) + __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) + __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) + __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) + __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) + __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) + __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) + __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) + __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) + __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) + __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) + __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) + __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) + __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) + __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) + __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) + __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) + __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) + __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) + __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) + /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ +) + +GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) +) + +GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, + __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) + __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) + __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) +) + +GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, + /* the reason of the broadcast, + * if this is an event triggered broadcast. */ + __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) + __u32_field(2, DRBD_F_REQUIRED, current_state) + __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) + __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) + + /* These are for broadcast from after state change work. + * prev_state and new_state are from the moment the state change took + * place, new_state is not neccessarily the same as current_state, + * there may have been more state changes since. Which will be + * broadcasted soon, in their respective after state change work. 
*/ + __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) + __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) + + /* if we have a local disk: */ + __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) + __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) + __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) + __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) + /* and in case resync or online verify is active */ + __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) + __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) + + /* for pre and post notifications of helper execution */ + __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) + __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) + + __u64_field(15, 0, send_cnt) + __u64_field(16, 0, recv_cnt) + __u64_field(17, 0, read_cnt) + __u64_field(18, 0, writ_cnt) + __u64_field(19, 0, al_writ_cnt) + __u64_field(20, 0, bm_writ_cnt) + __u32_field(21, 0, ap_bio_cnt) + __u32_field(22, 0, ap_pending_cnt) + __u32_field(23, 0, rs_pending_cnt) +) + +GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, + __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) + __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) +) + +GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) +) + +GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, + __u32_field(1, DRBD_F_REQUIRED, timeout_type) +) + +GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) +) + +GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) +) + +/* + * Notifications and commands (genlmsghdr->cmd) + */ +GENL_mc_group(events) + + /* kernel -> userspace announcement of changes */ +GENL_notification( + DRBD_EVENT, 1, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) +) + + /* query kernel for specific or all info */ +GENL_op( + DRBD_ADM_GET_STATUS, 2, + GENL_op_init( + .doit = drbd_adm_get_status, + .dumpit = drbd_adm_get_status_all, + /* anyone may ask for the status, + * it is broadcasted anyways */ + ), + /* To select the object .doit. + * Or a subset of objects in .dumpit. 
*/ + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) +) + + /* add DRBD minor devices as volumes to resources */ +GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + + /* add or delete resources */ +GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, + GENL_doit(drbd_adm_resource_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_CONNECT, 10, + GENL_doit(drbd_adm_connect), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_CHG_NET_OPTS, 29, + GENL_doit(drbd_adm_net_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) +) + +GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_ATTACH, 12, + GENL_doit(drbd_adm_attach), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED) +) + +GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28, + GENL_doit(drbd_adm_disk_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_RESIZE, 13, + GENL_doit(drbd_adm_resize), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_PRIMARY, 14, + GENL_doit(drbd_adm_set_role), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_SECONDARY, 15, + GENL_doit(drbd_adm_set_role), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_NEW_C_UUID, 16, + GENL_doit(drbd_adm_new_c_uuid), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_START_OV, 17, + GENL_doit(drbd_adm_start_ov), + GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_RESUME_IO, 24, 
GENL_doit(drbd_adm_resume_io), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) diff --git a/trunk/include/linux/drbd_genl_api.h b/trunk/include/linux/drbd_genl_api.h new file mode 100644 index 000000000000..9ef50d51e34e --- /dev/null +++ b/trunk/include/linux/drbd_genl_api.h @@ -0,0 +1,55 @@ +#ifndef DRBD_GENL_STRUCT_H +#define DRBD_GENL_STRUCT_H + +/** + * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests + * @minor: + * For admin requests (user -> kernel): which minor device to operate on. + * For (unicast) replies or informational (broadcast) messages + * (kernel -> user): which minor device the information is about. + * If we do not operate on minors, but on connections or resources, + * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT + * is used instead. + * @flags: possible operation modifiers (relevant only for user->kernel): + * DRBD_GENL_F_SET_DEFAULTS + * @volume: + * When creating a new minor (adding it to a resource), the resource needs + * to know which volume number within the resource this is supposed to be. + * The volume number corresponds to the same volume number on the remote side, + * whereas the minor number on the remote side may be different + * (union with flags). + * @ret_code: kernel->userland unicast cfg reply return code (union with flags); + */ +struct drbd_genlmsghdr { + __u32 minor; + union { + __u32 flags; + __s32 ret_code; + }; +}; + +/* To be used in drbd_genlmsghdr.flags */ +enum { + DRBD_GENL_F_SET_DEFAULTS = 1, +}; + +enum drbd_state_info_bcast_reason { + SIB_GET_STATUS_REPLY = 1, + SIB_STATE_CHANGE = 2, + SIB_HELPER_PRE = 3, + SIB_HELPER_POST = 4, + SIB_SYNC_PROGRESS = 5, +}; + +/* hack around predefined gcc/cpp "linux=1", + * we cannot possibly include <1/drbd_genl.h> */ +#undef linux + +#include +#define GENL_MAGIC_VERSION API_VERSION +#define GENL_MAGIC_FAMILY drbd +#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr) +#define GENL_MAGIC_INCLUDE_FILE +#include + +#endif diff --git a/trunk/include/linux/drbd_limits.h b/trunk/include/linux/drbd_limits.h index fb670bf603f7..1fa19c5f5e64 100644 --- a/trunk/include/linux/drbd_limits.h +++ b/trunk/include/linux/drbd_limits.h @@ -16,29 +16,37 @@ #define DEBUG_RANGE_CHECK 0 #define DRBD_MINOR_COUNT_MIN 1 -#define DRBD_MINOR_COUNT_MAX 256 +#define DRBD_MINOR_COUNT_MAX 255 #define DRBD_MINOR_COUNT_DEF 32 +#define DRBD_MINOR_COUNT_SCALE '1' + +#define DRBD_VOLUME_MAX 65535 #define DRBD_DIALOG_REFRESH_MIN 0 #define DRBD_DIALOG_REFRESH_MAX 600 +#define DRBD_DIALOG_REFRESH_SCALE '1' /* valid port number */ #define DRBD_PORT_MIN 1 #define DRBD_PORT_MAX 0xffff +#define DRBD_PORT_SCALE '1' /* startup { */ /* if you want more than 3.4 days, disable */ #define DRBD_WFC_TIMEOUT_MIN 0 #define DRBD_WFC_TIMEOUT_MAX 300000 #define DRBD_WFC_TIMEOUT_DEF 0 +#define DRBD_WFC_TIMEOUT_SCALE '1' #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 +#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1' #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 +#define 
DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1' /* }*/ /* net { */ @@ -47,75 +55,91 @@ #define DRBD_TIMEOUT_MIN 1 #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ +#define DRBD_TIMEOUT_SCALE '1' /* If backing disk takes longer than disk_timeout, mark the disk as failed */ #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ +#define DRBD_DISK_TIMEOUT_SCALE '1' /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 #define DRBD_CONNECT_INT_DEF 10 /* seconds */ +#define DRBD_CONNECT_INT_SCALE '1' /* keep-alive probes when idle */ #define DRBD_PING_INT_MIN 1 #define DRBD_PING_INT_MAX 120 #define DRBD_PING_INT_DEF 10 +#define DRBD_PING_INT_SCALE '1' /* timeout for the ping packets.*/ #define DRBD_PING_TIMEO_MIN 1 #define DRBD_PING_TIMEO_MAX 300 #define DRBD_PING_TIMEO_DEF 5 +#define DRBD_PING_TIMEO_SCALE '1' /* max number of write requests between write barriers */ #define DRBD_MAX_EPOCH_SIZE_MIN 1 #define DRBD_MAX_EPOCH_SIZE_MAX 20000 #define DRBD_MAX_EPOCH_SIZE_DEF 2048 +#define DRBD_MAX_EPOCH_SIZE_SCALE '1' /* I don't think that a tcp send buffer of more than 10M is useful */ #define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX (10<<20) #define DRBD_SNDBUF_SIZE_DEF 0 +#define DRBD_SNDBUF_SIZE_SCALE '1' #define DRBD_RCVBUF_SIZE_MIN 0 #define DRBD_RCVBUF_SIZE_MAX (10<<20) #define DRBD_RCVBUF_SIZE_DEF 0 +#define DRBD_RCVBUF_SIZE_SCALE '1' /* @4k PageSize -> 128kB - 512MB */ #define DRBD_MAX_BUFFERS_MIN 32 #define DRBD_MAX_BUFFERS_MAX 131072 #define DRBD_MAX_BUFFERS_DEF 2048 +#define DRBD_MAX_BUFFERS_SCALE '1' /* @4k PageSize -> 4kB - 512MB */ #define DRBD_UNPLUG_WATERMARK_MIN 1 #define DRBD_UNPLUG_WATERMARK_MAX 131072 #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) +#define DRBD_UNPLUG_WATERMARK_SCALE '1' /* 0 is disabled. * 200 should be more than enough even for very short timeouts */ #define DRBD_KO_COUNT_MIN 0 #define DRBD_KO_COUNT_MAX 200 -#define DRBD_KO_COUNT_DEF 0 +#define DRBD_KO_COUNT_DEF 7 +#define DRBD_KO_COUNT_SCALE '1' /* } */ /* syncer { */ /* FIXME allow rate to be zero? */ -#define DRBD_RATE_MIN 1 +#define DRBD_RESYNC_RATE_MIN 1 /* channel bonding 10 GbE, or other hardware */ -#define DRBD_RATE_MAX (4 << 20) -#define DRBD_RATE_DEF 250 /* kb/second */ +#define DRBD_RESYNC_RATE_MAX (4 << 20) +#define DRBD_RESYNC_RATE_DEF 250 +#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ /* less than 7 would hit performance unnecessarily. - * 3833 is the largest prime that still does fit - * into 64 sectors of activity log */ + * 919 slots context information per transaction, + * 32k activity log, 4k transaction size, + * one transaction in flight: + * 919 * 7 = 6433 */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 3833 -#define DRBD_AL_EXTENTS_DEF 127 +#define DRBD_AL_EXTENTS_MAX 6433 +#define DRBD_AL_EXTENTS_DEF 1237 +#define DRBD_AL_EXTENTS_SCALE '1' -#define DRBD_AFTER_MIN -1 -#define DRBD_AFTER_MAX 255 -#define DRBD_AFTER_DEF -1 +#define DRBD_MINOR_NUMBER_MIN -1 +#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1) +#define DRBD_MINOR_NUMBER_DEF -1 +#define DRBD_MINOR_NUMBER_SCALE '1' /* } */ @@ -124,11 +148,12 @@ * the upper limit with 64bit kernel, enough ram and flexible meta data * is 1 PiB, currently. */ /* DRBD_MAX_SECTORS */ -#define DRBD_DISK_SIZE_SECT_MIN 0 -#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40)) -#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... 
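The "1 PiB" remark a few lines above checks out: DRBD_DISK_SIZE_SECT_MAX (renamed DRBD_DISK_SIZE_MAX by this hunk) is (1 * (2LLU << 40)) sectors of 512 bytes each, i.e. 2^41 * 2^9 = 2^50 bytes. A throwaway verification:

#include <stdio.h>

int main(void)
{
	unsigned long long max_sectors = 1 * (2ULL << 40);	/* 2^41 sectors */
	unsigned long long max_bytes = max_sectors * 512;	/* 2^50 bytes */

	printf("%llu sectors = %llu bytes = %llu TiB\n",
	       max_sectors, max_bytes, max_bytes >> 40);	/* 1024 TiB = 1 PiB */
	return 0;
}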
*/ +#define DRBD_DISK_SIZE_MIN 0 +#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40)) +#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */ +#define DRBD_DISK_SIZE_SCALE 's' /* sectors */ -#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON +#define DRBD_ON_IO_ERROR_DEF EP_DETACH #define DRBD_FENCING_DEF FP_DONT_CARE #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT @@ -136,38 +161,59 @@ #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR #define DRBD_ON_CONGESTION_DEF OC_BLOCK +#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 +#define DRBD_MAX_BIO_BVECS_SCALE '1' #define DRBD_C_PLAN_AHEAD_MIN 0 #define DRBD_C_PLAN_AHEAD_MAX 300 -#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ +#define DRBD_C_PLAN_AHEAD_DEF 20 +#define DRBD_C_PLAN_AHEAD_SCALE '1' #define DRBD_C_DELAY_TARGET_MIN 1 #define DRBD_C_DELAY_TARGET_MAX 100 #define DRBD_C_DELAY_TARGET_DEF 10 +#define DRBD_C_DELAY_TARGET_SCALE '1' #define DRBD_C_FILL_TARGET_MIN 0 #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ -#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ +#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */ +#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */ -#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ +#define DRBD_C_MAX_RATE_MIN 250 #define DRBD_C_MAX_RATE_MAX (4 << 20) #define DRBD_C_MAX_RATE_DEF 102400 +#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */ -#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ +#define DRBD_C_MIN_RATE_MIN 0 #define DRBD_C_MIN_RATE_MAX (4 << 20) -#define DRBD_C_MIN_RATE_DEF 4096 +#define DRBD_C_MIN_RATE_DEF 250 +#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */ #define DRBD_CONG_FILL_MIN 0 #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ #define DRBD_CONG_FILL_DEF 0 +#define DRBD_CONG_FILL_SCALE 's' /* sectors */ #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF +#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE + +#define DRBD_PROTOCOL_DEF DRBD_PROT_C + +#define DRBD_DISK_BARRIER_DEF 0 +#define DRBD_DISK_FLUSHES_DEF 1 +#define DRBD_DISK_DRAIN_DEF 1 +#define DRBD_MD_FLUSHES_DEF 1 +#define DRBD_TCP_CORK_DEF 1 +#define DRBD_AL_UPDATES_DEF 1 + +#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 +#define DRBD_ALWAYS_ASBP_DEF 0 +#define DRBD_USE_RLE_DEF 1 -#undef RANGE #endif diff --git a/trunk/include/linux/drbd_nl.h b/trunk/include/linux/drbd_nl.h deleted file mode 100644 index a8706f08ab36..000000000000 --- a/trunk/include/linux/drbd_nl.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - PAKET( name, - TYPE ( pn, pr, member ) - ... 
- ) - - You may never reissue one of the pn arguments -*/ - -#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) -#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" -#endif - -NL_PACKET(primary, 1, - NL_BIT( 1, T_MAY_IGNORE, primary_force) -) - -NL_PACKET(secondary, 2, ) - -NL_PACKET(disk_conf, 3, - NL_INT64( 2, T_MAY_IGNORE, disk_size) - NL_STRING( 3, T_MANDATORY, backing_dev, 128) - NL_STRING( 4, T_MANDATORY, meta_dev, 128) - NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) - NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) - NL_INTEGER( 7, T_MAY_IGNORE, fencing) - NL_BIT( 37, T_MAY_IGNORE, use_bmbv) - NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) - NL_BIT( 54, T_MAY_IGNORE, no_md_flush) - /* 55 max_bio_size was available in 8.2.6rc2 */ - NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) - NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) - NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) - NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) -) - -NL_PACKET(detach, 4, - NL_BIT( 88, T_MANDATORY, detach_force) -) - -NL_PACKET(net_conf, 5, - NL_STRING( 8, T_MANDATORY, my_addr, 128) - NL_STRING( 9, T_MANDATORY, peer_addr, 128) - NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) - NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) - NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) - NL_INTEGER( 14, T_MAY_IGNORE, timeout) - NL_INTEGER( 15, T_MANDATORY, wire_protocol) - NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) - NL_INTEGER( 17, T_MAY_IGNORE, ping_int) - NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) - NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) - NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) - NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) - NL_INTEGER( 22, T_MAY_IGNORE, ko_count) - NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) - NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) - NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) - NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) - NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) - NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) - NL_INTEGER( 81, T_MAY_IGNORE, on_congestion) - NL_INTEGER( 82, T_MAY_IGNORE, cong_fill) - NL_INTEGER( 83, T_MAY_IGNORE, cong_extents) - /* 59 addr_family was available in GIT, never released */ - NL_BIT( 60, T_MANDATORY, mind_af) - NL_BIT( 27, T_MAY_IGNORE, want_lose) - NL_BIT( 28, T_MAY_IGNORE, two_primaries) - NL_BIT( 41, T_MAY_IGNORE, always_asbp) - NL_BIT( 61, T_MAY_IGNORE, no_cork) - NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) - NL_BIT( 70, T_MANDATORY, dry_run) -) - -NL_PACKET(disconnect, 6, - NL_BIT( 84, T_MAY_IGNORE, force) -) - -NL_PACKET(resize, 7, - NL_INT64( 29, T_MAY_IGNORE, resize_size) - NL_BIT( 68, T_MAY_IGNORE, resize_force) - NL_BIT( 69, T_MANDATORY, no_resync) -) - -NL_PACKET(syncer_conf, 8, - NL_INTEGER( 30, T_MAY_IGNORE, rate) - NL_INTEGER( 31, T_MAY_IGNORE, after) - NL_INTEGER( 32, T_MAY_IGNORE, al_extents) -/* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) - * NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) - * NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) - * NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) - * feature will be reimplemented differently with 8.3.9 */ - NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) - NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) - NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) - NL_BIT( 65, T_MAY_IGNORE, use_rle) - NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) - NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) - NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) - NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) - 
NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) - NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) -) - -NL_PACKET(invalidate, 9, ) -NL_PACKET(invalidate_peer, 10, ) -NL_PACKET(pause_sync, 11, ) -NL_PACKET(resume_sync, 12, ) -NL_PACKET(suspend_io, 13, ) -NL_PACKET(resume_io, 14, ) -NL_PACKET(outdate, 15, ) -NL_PACKET(get_config, 16, ) -NL_PACKET(get_state, 17, - NL_INTEGER( 33, T_MAY_IGNORE, state_i) -) - -NL_PACKET(get_uuids, 18, - NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) - NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) -) - -NL_PACKET(get_timeout_flag, 19, - NL_BIT( 36, T_MAY_IGNORE, use_degraded) -) - -NL_PACKET(call_helper, 20, - NL_STRING( 38, T_MAY_IGNORE, helper, 32) -) - -/* Tag nr 42 already allocated in drbd-8.1 development. */ - -NL_PACKET(sync_progress, 23, - NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) -) - -NL_PACKET(dump_ee, 24, - NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) - NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) - NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) - NL_INT64( 48, T_MAY_IGNORE, ee_sector) - NL_INT64( 49, T_MAY_IGNORE, ee_block_id) - NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) -) - -NL_PACKET(start_ov, 25, - NL_INT64( 66, T_MAY_IGNORE, start_sector) -) - -NL_PACKET(new_c_uuid, 26, - NL_BIT( 63, T_MANDATORY, clear_bm) -) - -#ifdef NL_RESPONSE -NL_RESPONSE(return_code_only, 27) -#endif - -#undef NL_PACKET -#undef NL_INTEGER -#undef NL_INT64 -#undef NL_BIT -#undef NL_STRING -#undef NL_RESPONSE diff --git a/trunk/include/linux/drbd_tag_magic.h b/trunk/include/linux/drbd_tag_magic.h deleted file mode 100644 index 82de1f9e48b1..000000000000 --- a/trunk/include/linux/drbd_tag_magic.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef DRBD_TAG_MAGIC_H -#define DRBD_TAG_MAGIC_H - -#define TT_END 0 -#define TT_REMOVED 0xE000 - -/* declare packet_type enums */ -enum packet_types { -#define NL_PACKET(name, number, fields) P_ ## name = number, -#define NL_RESPONSE(name, number) P_ ## name = number, -#define NL_INTEGER(pn, pr, member) -#define NL_INT64(pn, pr, member) -#define NL_BIT(pn, pr, member) -#define NL_STRING(pn, pr, member, len) -#include - P_nl_after_last_packet, -}; - -/* These struct are used to deduce the size of the tag lists: */ -#define NL_PACKET(name, number, fields) \ - struct name ## _tag_len_struct { fields }; -#define NL_INTEGER(pn, pr, member) \ - int member; int tag_and_len ## member; -#define NL_INT64(pn, pr, member) \ - __u64 member; int tag_and_len ## member; -#define NL_BIT(pn, pr, member) \ - unsigned char member:1; int tag_and_len ## member; -#define NL_STRING(pn, pr, member, len) \ - unsigned char member[len]; int member ## _len; \ - int tag_and_len ## member; -#include - -/* declare tag-list-sizes */ -static const int tag_list_sizes[] = { -#define NL_PACKET(name, number, fields) 2 fields , -#define NL_INTEGER(pn, pr, member) + 4 + 4 -#define NL_INT64(pn, pr, member) + 4 + 8 -#define NL_BIT(pn, pr, member) + 4 + 1 -#define NL_STRING(pn, pr, member, len) + 4 + (len) -#include -}; - -/* The two highest bits are used for the tag type */ -#define TT_MASK 0xC000 -#define TT_INTEGER 0x0000 -#define TT_INT64 0x4000 -#define TT_BIT 0x8000 -#define TT_STRING 0xC000 -/* The next bit indicates if processing of the tag is mandatory */ -#define T_MANDATORY 0x2000 -#define T_MAY_IGNORE 0x0000 -#define TN_MASK 0x1fff -/* The remaining 13 bits are used to enumerate the tags */ - -#define tag_type(T) ((T) & TT_MASK) -#define tag_number(T) ((T) & TN_MASK) - -/* declare tag enums */ -#define NL_PACKET(name, number, fields) 
fields -enum drbd_tags { -#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , -#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , -#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , -#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , -#include -}; - -struct tag { - const char *name; - int type_n_flags; - int max_len; -}; - -/* declare tag names */ -#define NL_PACKET(name, number, fields) fields -static const struct tag tag_descriptions[] = { -#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, -#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, -#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, -#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, -#include -}; - -#endif diff --git a/trunk/include/linux/exportfs.h b/trunk/include/linux/exportfs.h index 12291a7ee275..c7e6b6392ab8 100644 --- a/trunk/include/linux/exportfs.h +++ b/trunk/include/linux/exportfs.h @@ -177,6 +177,8 @@ struct export_operations { int (*commit_metadata)(struct inode *inode); }; +extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, + int *max_len, struct inode *parent); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int connectable); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 408fb1e77a0a..a823d4be38e7 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -44,6 +44,7 @@ struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; +struct seq_file; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -1543,6 +1544,7 @@ struct file_operations { int (*setlease)(struct file *, long, struct file_lock **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); + int (*show_fdinfo)(struct seq_file *m, struct file *f); }; struct inode_operations { @@ -1578,8 +1580,6 @@ struct inode_operations { umode_t create_mode, int *opened); } ____cacheline_aligned; -struct seq_file; - ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, @@ -1810,6 +1810,8 @@ struct file_system_type { #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ +#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ struct dentry *(*mount) (struct file_system_type *, int, @@ -2286,9 +2288,9 @@ extern ino_t find_inode_number(struct dentry *, struct qstr *); #include /* needed for stackable file system support */ -extern loff_t default_llseek(struct file *file, loff_t offset, int origin); +extern loff_t default_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); +extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); @@ -2396,11 +2398,11 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); -extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); -extern loff_t no_llseek(struct file *file, loff_t offset, int origin); -extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); +extern loff_t no_llseek(struct file *file, loff_t offset, int whence); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, - int origin, loff_t maxsize, loff_t eof); + int whence, loff_t maxsize, loff_t eof); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); diff --git a/trunk/include/linux/ftrace.h b/trunk/include/linux/ftrace.h index a52f2f4fe030..92691d85c320 100644 --- a/trunk/include/linux/ftrace.h +++ b/trunk/include/linux/ftrace.h @@ -394,7 +394,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init @@ -559,7 +559,7 @@ static inline ssize_t ftrace_filter_write(struct file *file, const char __user * size_t cnt, loff_t *ppos) { return -ENODEV; } static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } -static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { return -ENODEV; } diff --git a/trunk/include/linux/genhd.h b/trunk/include/linux/genhd.h index 4f440b3e89fe..79b8bba19363 100644 --- a/trunk/include/linux/genhd.h +++ b/trunk/include/linux/genhd.h @@ -88,10 +88,14 @@ struct disk_stats { }; #define PARTITION_META_INFO_VOLNAMELTH 64 -#define PARTITION_META_INFO_UUIDLTH 16 +/* + * Enough for the string representation of any kind of UUID plus NULL. + * EFI UUID is 36 characters. MSDOS UUID is 11 characters. 
+ */ +#define PARTITION_META_INFO_UUIDLTH 37 struct partition_meta_info { - u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ + char uuid[PARTITION_META_INFO_UUIDLTH]; u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; diff --git a/trunk/include/linux/genl_magic_func.h b/trunk/include/linux/genl_magic_func.h new file mode 100644 index 000000000000..023bc346b877 --- /dev/null +++ b/trunk/include/linux/genl_magic_func.h @@ -0,0 +1,422 @@ +#ifndef GENL_MAGIC_FUNC_H +#define GENL_MAGIC_FUNC_H + +#include + +/* + * Magic: declare tla policy {{{1 + * Magic: declare nested policies + * {{{2 + */ +#undef GENL_mc_group +#define GENL_mc_group(group) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + [tag_name] = { .type = NLA_NESTED }, + +static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static struct nla_policy s_name ## _nl_policy[] __read_mostly = \ +{ s_fields }; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \ + __put, __is_signed) \ + [attr_nr] = { .type = nla_type }, + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \ + __get, __put, __is_signed) \ + [attr_nr] = { .type = nla_type, \ + .len = maxlen - (nla_type == NLA_NUL_STRING) }, + +#include GENL_MAGIC_INCLUDE_FILE + +#ifndef __KERNEL__ +#ifndef pr_info +#define pr_info(args...) fprintf(stderr, args); +#endif +#endif + +#ifdef GENL_MAGIC_DEBUG +static void dprint_field(const char *dir, int nla_type, + const char *name, void *valp) +{ + __u64 val = valp ? *(__u32 *)valp : 1; + switch (nla_type) { + case NLA_U8: val = (__u8)val; + case NLA_U16: val = (__u16)val; + case NLA_U32: val = (__u32)val; + pr_info("%s attr %s: %d 0x%08x\n", dir, + name, (int)val, (unsigned)val); + break; + case NLA_U64: + val = *(__u64*)valp; + pr_info("%s attr %s: %lld 0x%08llx\n", dir, + name, (long long)val, (unsigned long long)val); + break; + case NLA_FLAG: + if (val) + pr_info("%s attr %s: set\n", dir, name); + break; + } +} + +static void dprint_array(const char *dir, int nla_type, + const char *name, const char *val, unsigned len) +{ + switch (nla_type) { + case NLA_NUL_STRING: + if (len && val[len-1] == '\0') + len--; + pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val); + break; + default: + /* we can always show 4 byte, + * thats what nlattr are aligned to. */ + pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n", + dir, name, len, val[0], val[1], val[2], val[3]); + } +} + +#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b); + +/* Name is a member field name of the struct s. + * If s is NULL (only parsing, no copy requested in *_from_attrs()), + * nla is supposed to point to the attribute containing the information + * corresponding to that struct member. */ +#define DPRINT_FIELD(dir, nla_type, name, s, nla) \ + do { \ + if (s) \ + dprint_field(dir, nla_type, #name, &s->name); \ + else if (nla) \ + dprint_field(dir, nla_type, #name, \ + (nla_type == NLA_FLAG) ? 
NULL \ + : nla_data(nla)); \ + } while (0) + +#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \ + do { \ + if (s) \ + dprint_array(dir, nla_type, #name, \ + s->name, s->name ## _len); \ + else if (nla) \ + dprint_array(dir, nla_type, #name, \ + nla_data(nla), nla_len(nla)); \ + } while (0) +#else +#define DPRINT_TLA(a, op, b) do {} while (0) +#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0) +#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0) +#endif + +/* + * Magic: provide conversion functions {{{1 + * populate struct from attribute table: + * {{{2 + */ + +/* processing of generic netlink messages is serialized. + * use one static buffer for parsing of nested attributes */ +static struct nlattr *nested_attr_tb[128]; + +#ifndef BUILD_BUG_ON +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. in a structure initializer (or where-ever else comma expressions + aren't permitted). */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +#endif + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +/* *_from_attrs functions are static, but potentially unused */ \ +static int __ ## s_name ## _from_attrs(struct s_name *s, \ + struct genl_info *info, bool exclude_invariants) \ +{ \ + const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \ + struct nlattr *tla = info->attrs[tag_number]; \ + struct nlattr **ntb = nested_attr_tb; \ + struct nlattr *nla; \ + int err; \ + BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \ + if (!tla) \ + return -ENOMSG; \ + DPRINT_TLA(#s_name, "<=-", #tag_name); \ + err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ + if (err) \ + return err; \ + \ + s_fields \ + return 0; \ +} __attribute__((unused)) \ +static int s_name ## _from_attrs(struct s_name *s, \ + struct genl_info *info) \ +{ \ + return __ ## s_name ## _from_attrs(s, info, false); \ +} __attribute__((unused)) \ +static int s_name ## _from_attrs_for_change(struct s_name *s, \ + struct genl_info *info) \ +{ \ + return __ ## s_name ## _from_attrs(s, info, true); \ +} __attribute__((unused)) \ + +#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \ + nla = ntb[attr_nr]; \ + if (nla) { \ + if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ + pr_info("<< must not change invariant attr: %s\n", #name); \ + return -EEXIST; \ + } \ + assignment; \ + } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ + /* attribute missing from payload, */ \ + /* which was expected */ \ + } else if ((attr_flag) & DRBD_F_REQUIRED) { \ + pr_info("<< missing attr: %s\n", #name); \ + return -ENOMSG; \ + } + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + __assign(attr_nr, attr_flag, name, nla_type, type, \ + if (s) \ + s->name = __get(nla); \ + DPRINT_FIELD("<<", nla_type, name, s, nla)) + +/* validate_nla() already checked nla_len <= maxlen appropriately. 
*/ +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + __assign(attr_nr, attr_flag, name, nla_type, type, \ + if (s) \ + s->name ## _len = \ + __get(s->name, nla, maxlen); \ + DPRINT_ARRAY("<<", nla_type, name, s, nla)) + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +/* + * Magic: define op number to op name mapping {{{1 + * {{{2 + */ +const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) +{ + switch (cmd) { +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ + case op_num: return #op_name; +#include GENL_MAGIC_INCLUDE_FILE + default: + return "unknown"; + } +} + +#ifdef __KERNEL__ +#include +/* + * Magic: define genl_ops {{{1 + * {{{2 + */ + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ +{ \ + handler \ + .cmd = op_name, \ + .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ +}, + +#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) +static struct genl_ops ZZZ_genl_ops[] __read_mostly = { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +/* + * Define the genl_family, multicast groups, {{{1 + * and provide register/unregister functions. + * {{{2 + */ +#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) +static struct genl_family ZZZ_genl_family __read_mostly = { + .id = GENL_ID_GENERATE, + .name = __stringify(GENL_MAGIC_FAMILY), + .version = GENL_MAGIC_VERSION, +#ifdef GENL_MAGIC_FAMILY_HDRSZ + .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), +#endif + .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, +}; + +/* + * Magic: define multicast groups + * Magic: define multicast group registration helper + */ +#undef GENL_mc_group +#define GENL_mc_group(group) \ +static struct genl_multicast_group \ +CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \ + .name = #group, \ +}; \ +static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ + struct sk_buff *skb, gfp_t flags) \ +{ \ + unsigned int group_id = \ + CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \ + if (!group_id) \ + return -EINVAL; \ + return genlmsg_multicast(skb, 0, group_id, flags); \ +} + +#include GENL_MAGIC_INCLUDE_FILE + +int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) +{ + int err = genl_register_family_with_ops(&ZZZ_genl_family, + ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops)); + if (err) + return err; +#undef GENL_mc_group +#define GENL_mc_group(group) \ + err = genl_register_mc_group(&ZZZ_genl_family, \ + &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \ + if (err) \ + goto fail; \ + else \ + pr_info("%s: mcg %s: %u\n", #group, \ + __stringify(GENL_MAGIC_FAMILY), \ + CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id); + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_mc_group +#define GENL_mc_group(group) + return 0; +fail: + genl_unregister_family(&ZZZ_genl_family); + return err; +} + +void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) +{ + genl_unregister_family(&ZZZ_genl_family); +} + +/* + * Magic: provide conversion functions {{{1 + * populate skb from struct. 
+ * {{{2 + */ + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \ + const bool exclude_sensitive) \ +{ \ + struct nlattr *tla = nla_nest_start(skb, tag_number); \ + if (!tla) \ + goto nla_put_failure; \ + DPRINT_TLA(#s_name, "-=>", #tag_name); \ + s_fields \ + nla_nest_end(skb, tla); \ + return 0; \ + \ +nla_put_failure: \ + if (tla) \ + nla_nest_cancel(skb, tla); \ + return -EMSGSIZE; \ +} \ +static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \ + struct s_name *s) \ +{ \ + return s_name ## _to_skb(skb, s, 0); \ +} \ +static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \ + struct s_name *s) \ +{ \ + return s_name ## _to_skb(skb, s, 1); \ +} + + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ + DPRINT_FIELD(">>", nla_type, name, s, NULL); \ + if (__put(skb, attr_nr, s->name)) \ + goto nla_put_failure; \ + } + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ + DPRINT_ARRAY(">>",nla_type, name, s, NULL); \ + if (__put(skb, attr_nr, min_t(int, maxlen, \ + s->name ## _len + (nla_type == NLA_NUL_STRING)),\ + s->name)) \ + goto nla_put_failure; \ + } + +#include GENL_MAGIC_INCLUDE_FILE + + +/* Functions for initializing structs to default values. */ + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) +#undef __u32_field_def +#define __u32_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __s32_field_def +#define __s32_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __flg_field_def +#define __flg_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __str_field_def +#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ + memset(x->name, 0, sizeof(x->name)); \ + x->name ## _len = 0; +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \ +static void set_ ## s_name ## _defaults(struct s_name *x) { \ +s_fields \ +} + +#include GENL_MAGIC_INCLUDE_FILE + +#endif /* __KERNEL__ */ + +/* }}}1 */ +#endif /* GENL_MAGIC_FUNC_H */ +/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff --git a/trunk/include/linux/genl_magic_struct.h b/trunk/include/linux/genl_magic_struct.h new file mode 100644 index 000000000000..eecd19b37001 --- /dev/null +++ b/trunk/include/linux/genl_magic_struct.h @@ -0,0 +1,277 @@ +#ifndef GENL_MAGIC_STRUCT_H +#define GENL_MAGIC_STRUCT_H + +#ifndef GENL_MAGIC_FAMILY +# error "you need to define GENL_MAGIC_FAMILY before inclusion" +#endif + +#ifndef GENL_MAGIC_VERSION +# error "you need to define GENL_MAGIC_VERSION before inclusion" +#endif + +#ifndef GENL_MAGIC_INCLUDE_FILE +# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion" +#endif + +#include +#include + +#define CONCAT__(a,b) a ## b +#define CONCAT_(a,b) CONCAT__(a,b) + +extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void); +extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void); + 
+/* + * Extension of genl attribute validation policies {{{2 + */ + +/* + * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not + * know about. This flag can be set in nlattr->nla_type to indicate that this + * attribute must not be ignored. + * + * We check and remove this flag in drbd_nla_check_mandatory() before + * validating the attribute types and lengths via nla_parse_nested(). + */ +#define DRBD_GENLA_F_MANDATORY (1 << 14) + +/* + * Flags specific to drbd and not visible at the netlink layer, used in + * _from_attrs and _to_skb: + * + * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is + * invalid. + * + * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be + * included in unpriviledged get requests or broadcasts. + * + * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but + * cannot subsequently be changed. + */ +#define DRBD_F_REQUIRED (1 << 0) +#define DRBD_F_SENSITIVE (1 << 1) +#define DRBD_F_INVARIANT (1 << 2) + +#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) + +/* }}}1 + * MAGIC + * multi-include macro expansion magic starts here + */ + +/* MAGIC helpers {{{2 */ + +/* possible field types */ +#define __flg_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U8, char, \ + nla_get_u8, nla_put_u8, false) +#define __u8_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \ + nla_get_u8, nla_put_u8, false) +#define __u16_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U16, __u16, \ + nla_get_u16, nla_put_u16, false) +#define __u32_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U32, __u32, \ + nla_get_u32, nla_put_u32, false) +#define __s32_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U32, __s32, \ + nla_get_u32, nla_put_u32, true) +#define __u64_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U64, __u64, \ + nla_get_u64, nla_put_u64, false) +#define __str_field(attr_nr, attr_flag, name, maxlen) \ + __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ + nla_strlcpy, nla_put, false) +#define __bin_field(attr_nr, attr_flag, name, maxlen) \ + __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ + nla_memcpy, nla_put, false) + +/* fields with default values */ +#define __flg_field_def(attr_nr, attr_flag, name, default) \ + __flg_field(attr_nr, attr_flag, name) +#define __u32_field_def(attr_nr, attr_flag, name, default) \ + __u32_field(attr_nr, attr_flag, name) +#define __s32_field_def(attr_nr, attr_flag, name, default) \ + __s32_field(attr_nr, attr_flag, name) +#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ + __str_field(attr_nr, attr_flag, name, maxlen) + +#define GENL_op_init(args...) 
args +#define GENL_doit(handler) \ + .doit = handler, \ + .flags = GENL_ADMIN_PERM, +#define GENL_dumpit(handler) \ + .dumpit = handler, \ + .flags = GENL_ADMIN_PERM, + +/* }}}1 + * Magic: define the enum symbols for genl_ops + * Magic: define the enum symbols for top level attributes + * Magic: define the enum symbols for nested attributes + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +#undef GENL_mc_group +#define GENL_mc_group(group) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ + op_name = op_num, + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ + op_name = op_num, + +enum { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + tag_name = tag_number, + +enum { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +enum { \ + s_fields \ +}; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, \ + __get, __put, __is_signed) \ + T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, \ + maxlen, __get, __put, __is_signed) \ + T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 + * Magic: compile time assert unique numbers for operations + * Magic: -"- unique numbers for top level attributes + * Magic: -"- unique numbers for nested attributes + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) \ + case op_name: + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ + case op_name: + +static inline void ct_assert_unique_operations(void) +{ + switch (0) { +#include GENL_MAGIC_INCLUDE_FILE + ; + } +} + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + case tag_number: + +static inline void ct_assert_unique_top_level_attributes(void) +{ + switch (0) { +#include GENL_MAGIC_INCLUDE_FILE + ; + } +} + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ +{ \ + switch (0) { \ + s_fields \ + ; \ + } \ +} + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + case attr_nr: + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + case attr_nr: + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 + * Magic: declare structs + * struct { + * fields + * }; + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +struct s_name { s_fields }; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + type name; + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + type 
name[maxlen]; \ + __u32 name ## _len; + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +enum { \ + s_fields \ +}; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + is_signed) \ + F_ ## name ## _IS_SIGNED = is_signed, + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, is_signed) \ + F_ ## name ## _IS_SIGNED = is_signed, + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 */ +#endif /* GENL_MAGIC_STRUCT_H */ +/* vim: set foldmethod=marker nofoldenable : */ diff --git a/trunk/include/linux/gfp.h b/trunk/include/linux/gfp.h index f74856e17e48..0f615eb23d05 100644 --- a/trunk/include/linux/gfp.h +++ b/trunk/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -89,6 +90,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -365,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); +extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); + #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/trunk/include/linux/hugetlb_cgroup.h b/trunk/include/linux/hugetlb_cgroup.h index d73878c694b3..ce8217f7b5c2 100644 --- a/trunk/include/linux/hugetlb_cgroup.h +++ b/trunk/include/linux/hugetlb_cgroup.h @@ -62,7 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); -extern int hugetlb_cgroup_file_init(int idx) __init; +extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage); @@ -111,9 +111,8 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static inline int __init hugetlb_cgroup_file_init(int idx) +static inline void hugetlb_cgroup_file_init(void) { - return 0; } static inline void hugetlb_cgroup_migrate(struct page *oldhpage, diff --git a/trunk/include/linux/i2c-omap.h b/trunk/include/linux/i2c-omap.h index 92a0dc75bc74..babe0cf6d56b 100644 --- a/trunk/include/linux/i2c-omap.h +++ b/trunk/include/linux/i2c-omap.h @@ -20,8 +20,6 @@ #define OMAP_I2C_FLAG_NO_FIFO BIT(0) #define OMAP_I2C_FLAG_SIMPLE_CLOCK BIT(1) #define OMAP_I2C_FLAG_16BIT_DATA_REG BIT(2) -#define OMAP_I2C_FLAG_RESET_REGS_POSTIDLE BIT(3) -#define OMAP_I2C_FLAG_APPLY_ERRATA_I207 BIT(4) #define OMAP_I2C_FLAG_ALWAYS_ARMXOR_CLK BIT(5) #define OMAP_I2C_FLAG_FORCE_19200_INT_CLK BIT(6) /* how the CPU address bus must be translated for I2C unit access */ diff --git a/trunk/include/linux/i2c/i2c-sh_mobile.h b/trunk/include/linux/i2c/i2c-sh_mobile.h index 
beda7081aead..06e3089795fb 100644 --- a/trunk/include/linux/i2c/i2c-sh_mobile.h +++ b/trunk/include/linux/i2c/i2c-sh_mobile.h @@ -5,6 +5,7 @@ struct i2c_sh_mobile_platform_data { unsigned long bus_speed; + unsigned int clks_per_count; }; #endif /* __I2C_SH_MOBILE_H__ */ diff --git a/trunk/include/linux/idr.h b/trunk/include/linux/idr.h index 87259a44c251..de7e190f1af4 100644 --- a/trunk/include/linux/idr.h +++ b/trunk/include/linux/idr.h @@ -152,4 +152,15 @@ void ida_simple_remove(struct ida *ida, unsigned int id); void __init idr_init_cache(void); +/** + * idr_for_each_entry - iterate over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + */ +#define idr_for_each_entry(idp, entry, id) \ + for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ + entry != NULL; \ + ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) + #endif /* __IDR_H__ */ diff --git a/trunk/include/linux/ima.h b/trunk/include/linux/ima.h index 2c7223d7e73b..86c361e947b9 100644 --- a/trunk/include/linux/ima.h +++ b/trunk/include/linux/ima.h @@ -18,6 +18,7 @@ extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern int ima_module_check(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -40,6 +41,11 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +static inline int ima_module_check(struct file *file) +{ + return 0; +} + #endif /* CONFIG_IMA_H */ #ifdef CONFIG_IMA_APPRAISE diff --git a/trunk/include/linux/init.h b/trunk/include/linux/init.h index f63692d6902e..a799273714ac 100644 --- a/trunk/include/linux/init.h +++ b/trunk/include/linux/init.h @@ -182,16 +182,16 @@ extern bool initcall_debug; * can point at the same handler without causing duplicate-symbol build errors. */ -#define __define_initcall(level,fn,id) \ +#define __define_initcall(fn, id) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" level ".init"))) = fn + __attribute__((__section__(".initcall" #id ".init"))) = fn /* * Early initcalls run before initializing SMP. * * Only for built-in code, not modules. */ -#define early_initcall(fn) __define_initcall("early",fn,early) +#define early_initcall(fn) __define_initcall(fn, early) /* * A "pure" initcall has no dependencies on anything else, and purely @@ -200,23 +200,23 @@ extern bool initcall_debug; * This only exists for built-in code, not for modules. * Keep main.c:initcall_level_names[] in sync. 
*/ -#define pure_initcall(fn) __define_initcall("0",fn,0) - -#define core_initcall(fn) __define_initcall("1",fn,1) -#define core_initcall_sync(fn) __define_initcall("1s",fn,1s) -#define postcore_initcall(fn) __define_initcall("2",fn,2) -#define postcore_initcall_sync(fn) __define_initcall("2s",fn,2s) -#define arch_initcall(fn) __define_initcall("3",fn,3) -#define arch_initcall_sync(fn) __define_initcall("3s",fn,3s) -#define subsys_initcall(fn) __define_initcall("4",fn,4) -#define subsys_initcall_sync(fn) __define_initcall("4s",fn,4s) -#define fs_initcall(fn) __define_initcall("5",fn,5) -#define fs_initcall_sync(fn) __define_initcall("5s",fn,5s) -#define rootfs_initcall(fn) __define_initcall("rootfs",fn,rootfs) -#define device_initcall(fn) __define_initcall("6",fn,6) -#define device_initcall_sync(fn) __define_initcall("6s",fn,6s) -#define late_initcall(fn) __define_initcall("7",fn,7) -#define late_initcall_sync(fn) __define_initcall("7s",fn,7s) +#define pure_initcall(fn) __define_initcall(fn, 0) + +#define core_initcall(fn) __define_initcall(fn, 1) +#define core_initcall_sync(fn) __define_initcall(fn, 1s) +#define postcore_initcall(fn) __define_initcall(fn, 2) +#define postcore_initcall_sync(fn) __define_initcall(fn, 2s) +#define arch_initcall(fn) __define_initcall(fn, 3) +#define arch_initcall_sync(fn) __define_initcall(fn, 3s) +#define subsys_initcall(fn) __define_initcall(fn, 4) +#define subsys_initcall_sync(fn) __define_initcall(fn, 4s) +#define fs_initcall(fn) __define_initcall(fn, 5) +#define fs_initcall_sync(fn) __define_initcall(fn, 5s) +#define rootfs_initcall(fn) __define_initcall(fn, rootfs) +#define device_initcall(fn) __define_initcall(fn, 6) +#define device_initcall_sync(fn) __define_initcall(fn, 6s) +#define late_initcall(fn) __define_initcall(fn, 7) +#define late_initcall_sync(fn) __define_initcall(fn, 7s) #define __initcall(fn) device_initcall(fn) diff --git a/trunk/include/linux/input.h b/trunk/include/linux/input.h index cab994ba6d91..82ce323b9986 100644 --- a/trunk/include/linux/input.h +++ b/trunk/include/linux/input.h @@ -112,6 +112,11 @@ struct input_value { * @h_list: list of input handles associated with the device. When * accessing the list dev->mutex must be held * @node: used to place the device onto input_dev_list + * @num_vals: number of values queued in the current frame + * @max_vals: maximum number of values queued in a frame + * @vals: array of values queued in the current frame + * @devres_managed: indicates that devices is managed with devres framework + * and needs not be explicitly unregistered or freed. 
*/ struct input_dev { const char *name; @@ -180,6 +185,8 @@ struct input_dev { unsigned int num_vals; unsigned int max_vals; struct input_value *vals; + + bool devres_managed; }; #define to_input_dev(d) container_of(d, struct input_dev, dev) @@ -323,7 +330,8 @@ struct input_handle { struct list_head h_node; }; -struct input_dev *input_allocate_device(void); +struct input_dev __must_check *input_allocate_device(void); +struct input_dev __must_check *devm_input_allocate_device(struct device *); void input_free_device(struct input_dev *dev); static inline struct input_dev *input_get_device(struct input_dev *dev) diff --git a/trunk/include/linux/input/bu21013.h b/trunk/include/linux/input/bu21013.h index 05e03284b92a..6230d76bde5d 100644 --- a/trunk/include/linux/input/bu21013.h +++ b/trunk/include/linux/input/bu21013.h @@ -9,13 +9,10 @@ /** * struct bu21013_platform_device - Handle the platform data - * @cs_en: pointer to the cs enable function - * @cs_dis: pointer to the cs disable function - * @irq_read_val: pointer to read the pen irq value function * @touch_x_max: touch x max * @touch_y_max: touch y max * @cs_pin: chip select pin - * @irq: irq pin + * @touch_pin: touch gpio pin * @ext_clk: external clock flag * @x_flip: x flip flag * @y_flip: y flip flag @@ -24,13 +21,10 @@ * This is used to handle the platform data */ struct bu21013_platform_device { - int (*cs_en)(int reset_pin); - int (*cs_dis)(int reset_pin); - int (*irq_read_val)(void); int touch_x_max; int touch_y_max; unsigned int cs_pin; - unsigned int irq; + unsigned int touch_pin; bool ext_clk; bool x_flip; bool y_flip; diff --git a/trunk/include/linux/ipc_namespace.h b/trunk/include/linux/ipc_namespace.h index 5499c92a9153..fe771978e877 100644 --- a/trunk/include/linux/ipc_namespace.h +++ b/trunk/include/linux/ipc_namespace.h @@ -67,6 +67,8 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; + + unsigned int proc_inum; }; extern struct ipc_namespace init_ipc_ns; @@ -133,7 +135,8 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk); + struct user_namespace *user_ns, struct ipc_namespace *ns); + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) @@ -144,12 +147,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); - return tsk->nsproxy->ipc_ns; + return ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) diff --git a/trunk/include/linux/kernel.h b/trunk/include/linux/kernel.h index d97ed5897447..d140e8fb075f 100644 --- a/trunk/include/linux/kernel.h +++ b/trunk/include/linux/kernel.h @@ -220,6 +220,23 @@ int __must_check _kstrtol(const char *s, unsigned int base, long *res); int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res); int __must_check kstrtoll(const char *s, unsigned int base, long long *res); + +/** + * kstrtoul - convert a string to an unsigned long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. 
+ * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. +*/ static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res) { /* @@ -233,6 +250,22 @@ static inline int __must_check kstrtoul(const char *s, unsigned int base, unsign return _kstrtoul(s, base, res); } +/** + * kstrtol - convert a string to a long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ static inline int __must_check kstrtol(const char *s, unsigned int base, long *res) { /* diff --git a/trunk/include/linux/loop.h b/trunk/include/linux/loop.h index 6492181bcb1d..460b60fa7adf 100644 --- a/trunk/include/linux/loop.h +++ b/trunk/include/linux/loop.h @@ -53,10 +53,13 @@ struct loop_device { spinlock_t lo_lock; struct bio_list lo_bio_list; + unsigned int lo_bio_count; int lo_state; struct mutex lo_ctl_mutex; struct task_struct *lo_thread; wait_queue_head_t lo_event; + /* wait queue for incoming requests */ + wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; struct gendisk *lo_disk; diff --git a/trunk/include/linux/lru_cache.h b/trunk/include/linux/lru_cache.h index cafc7f99e124..4019013c6593 100644 --- a/trunk/include/linux/lru_cache.h +++ b/trunk/include/linux/lru_cache.h @@ -166,9 +166,11 @@ struct lc_element { /* if we want to track a larger set of objects, * it needs to become arch independend u64 */ unsigned lc_number; - /* special label when on free list */ #define LC_FREE (~0U) + + /* for pending changes */ + unsigned lc_new_number; }; struct lru_cache { @@ -176,6 +178,7 @@ struct lru_cache { struct list_head lru; struct list_head free; struct list_head in_use; + struct list_head to_be_changed; /* the pre-created kmem cache to allocate the objects from */ struct kmem_cache *lc_cache; @@ -186,7 +189,7 @@ struct lru_cache { size_t element_off; /* number of elements (indices) */ - unsigned int nr_elements; + unsigned int nr_elements; /* Arbitrary limit on maximum tracked objects. Practical limit is much * lower due to allocation failures, probably. For typical use cases, * nr_elements should be a few thousand at most. @@ -194,18 +197,19 @@ struct lru_cache { * 8 high bits of .lc_index to be overloaded with flags in the future. 
*/ #define LC_MAX_ACTIVE (1<<24) + /* allow to accumulate a few (index:label) changes, + * but no more than max_pending_changes */ + unsigned int max_pending_changes; + /* number of elements currently on to_be_changed list */ + unsigned int pending_changes; + /* statistics */ - unsigned used; /* number of lelements currently on in_use list */ - unsigned long hits, misses, starving, dirty, changed; + unsigned used; /* number of elements currently on in_use list */ + unsigned long hits, misses, starving, locked, changed; /* see below: flag-bits for lru_cache */ unsigned long flags; - /* when changing the label of an index element */ - unsigned int new_number; - - /* for paranoia when changing the label of an index element */ - struct lc_element *changing_element; void *lc_private; const char *name; @@ -221,10 +225,15 @@ enum { /* debugging aid, to catch concurrent access early. * user needs to guarantee exclusive access by proper locking! */ __LC_PARANOIA, - /* if we need to change the set, but currently there is a changing - * transaction pending, we are "dirty", and must deferr further - * changing requests */ + + /* annotate that the set is "dirty", possibly accumulating further + * changes, until a transaction is finally triggered */ __LC_DIRTY, + + /* Locked, no further changes allowed. + * Also used to serialize changing transactions. */ + __LC_LOCKED, + /* if we need to change the set, but currently there is no free nor * unused element available, we are "starving", and must not give out * further references, to guarantee that eventually some refcnt will @@ -236,9 +245,11 @@ enum { }; #define LC_PARANOIA (1<<__LC_PARANOIA) #define LC_DIRTY (1<<__LC_DIRTY) +#define LC_LOCKED (1<<__LC_LOCKED) #define LC_STARVING (1<<__LC_STARVING) extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); @@ -249,7 +260,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); -extern void lc_changed(struct lru_cache *lc, struct lc_element *e); +extern void lc_committed(struct lru_cache *lc); struct seq_file; extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); @@ -258,17 +269,29 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char void (*detail) (struct seq_file *, struct lc_element *)); /** - * lc_try_lock - can be used to stop lc_get() from changing the tracked set + * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set * @lc: the lru cache to operate on * - * Note that the reference counts and order on the active and lru lists may - * still change. Returns true if we acquired the lock. + * Allows (expects) the set to be "dirty". Note that the reference counts and + * order on the active and lru lists may still change. Used to serialize + * changing transactions. Returns true if we aquired the lock. 
*/ -static inline int lc_try_lock(struct lru_cache *lc) +static inline int lc_try_lock_for_transaction(struct lru_cache *lc) { - return !test_and_set_bit(__LC_DIRTY, &lc->flags); + return !test_and_set_bit(__LC_LOCKED, &lc->flags); } +/** + * lc_try_lock - variant to stop lc_get() from changing the tracked set + * @lc: the lru cache to operate on + * + * Note that the reference counts and order on the active and lru lists may + * still change. Only works on a "clean" set. Returns true if we acquired the + * lock, which means there are no pending changes, and any further attempt to + * change the set will not succeed until the next lc_unlock(). + */ +extern int lc_try_lock(struct lru_cache *lc); + /** * lc_unlock - unlock @lc, allow lc_get() to change the set again * @lc: the lru cache to operate on @@ -276,14 +299,10 @@ static inline int lc_try_lock(struct lru_cache *lc) static inline void lc_unlock(struct lru_cache *lc) { clear_bit(__LC_DIRTY, &lc->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(__LC_LOCKED, &lc->flags); } -static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) -{ - struct lc_element *e = lc_find(lc, enr); - return e && e->refcnt; -} +extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); #define lc_entry(ptr, type, member) \ container_of(ptr, type, member) diff --git a/trunk/include/linux/memcontrol.h b/trunk/include/linux/memcontrol.h index e98a74c0c9c0..0108a56f814e 100644 --- a/trunk/include/linux/memcontrol.h +++ b/trunk/include/linux/memcontrol.h @@ -21,11 +21,14 @@ #define _LINUX_MEMCONTROL_H #include #include +#include +#include struct mem_cgroup; struct page_cgroup; struct page; struct mm_struct; +struct kmem_cache; /* Stats that can be updated by kernel. */ enum mem_cgroup_page_stat_item { @@ -414,5 +417,211 @@ static inline void sock_release_memcg(struct sock *sk) { } #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +extern struct static_key memcg_kmem_enabled_key; + +extern int memcg_limited_groups_array_size; + +/* + * Helper macro to loop through all memcg-specific caches. Callers must still + * check if the cache is valid (it is either valid or NULL). + * the slab_mutex must be held when looping through those caches + */ +#define for_each_memcg_cache_index(_idx) \ + for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) + +static inline bool memcg_kmem_enabled(void) +{ + return static_key_false(&memcg_kmem_enabled_key); +} + +/* + * In general, we'll do everything in our power to not incur any overhead + * for non-memcg users for the kmem functions. Not even a function call, if we + * can avoid it. + * + * Therefore, we'll inline all those functions so that in the best case, we'll + * see that kmemcg is off for everybody and proceed quickly. If it is on, + * we'll still do most of the flag checking inline. We check a lot of + * conditions, but because they are pretty simple, they are expected to be + * fast. 
+ */ +bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, + int order); +void __memcg_kmem_commit_charge(struct page *page, + struct mem_cgroup *memcg, int order); +void __memcg_kmem_uncharge_pages(struct page *page, int order); + +int memcg_cache_id(struct mem_cgroup *memcg); +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache); +void memcg_release_cache(struct kmem_cache *cachep); +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups); +void memcg_update_array_size(int num_groups); + +struct kmem_cache * +__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep); +void kmem_cache_destroy_memcg_children(struct kmem_cache *s); + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (!memcg_kmem_enabled()) + return true; + + /* + * __GFP_NOFAIL allocations will move on even if charging is not + * possible. Therefore we don't even try, and have this allocation + * unaccounted. We could in theory charge it with + * res_counter_charge_nofail, but we hope those allocations are rare, + * and won't be worth the trouble. + */ + if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + return true; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return true; + + /* If the task is dying, just let it go. */ + if (unlikely(fatal_signal_pending(current))) + return true; + + return __memcg_kmem_newpage_charge(gfp, memcg, order); } + +/** + * memcg_kmem_uncharge_pages: uncharge pages from memcg + * @page: pointer to struct page being freed + * @order: allocation order. + * + * there is no need to specify memcg here, since it is embedded in page_cgroup + */ +static inline void +memcg_kmem_uncharge_pages(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_pages(page, order); +} + +/** + * memcg_kmem_commit_charge: embeds correct memcg in a page + * @page: pointer to struct page recently allocated + * @memcg: the memcg structure we charged against + * @order: allocation order. + * + * Needs to be called after memcg_kmem_newpage_charge, regardless of success or + * failure of the allocation. If @page is NULL, this function will revert the + * charges. Otherwise, it will commit the memcg given by @memcg to the + * corresponding page_cgroup. + */ +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ + if (memcg_kmem_enabled() && memcg) + __memcg_kmem_commit_charge(page, memcg, order); +} + +/** + * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * @gfp: allocation flags. + * + * This function assumes that the task allocating, which determines the memcg + * in the page allocator, belongs to the same cgroup throughout the whole + * process. Misaccounting can happen if the task calls memcg_kmem_get_cache() + * while belonging to a cgroup, and later on changes.
This is considered + * acceptable, and should only happen upon task migration. + * + * Before the cache is created by the memcg core, there is also a possible + * imbalance: the task belongs to a memcg, but the cache being allocated from + * is the global cache, since the child cache is not yet guaranteed to be + * ready. This case is also fine, since in this case the GFP_KMEMCG will not be + * passed and the page allocator will not attempt any cgroup accounting. + */ +static __always_inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + if (!memcg_kmem_enabled()) + return cachep; + if (gfp & __GFP_NOFAIL) + return cachep; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return cachep; + if (unlikely(fatal_signal_pending(current))) + return cachep; + + return __memcg_kmem_get_cache(cachep, gfp); +} +#else +#define for_each_memcg_cache_index(_idx) \ + for (; NULL; ) + +static inline bool memcg_kmem_enabled(void) +{ + return false; +} + +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + return true; +} + +static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +{ +} + +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ +} + +static inline int memcg_cache_id(struct mem_cgroup *memcg) +{ + return -1; +} + +static inline int +memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + return 0; +} + +static inline void memcg_release_cache(struct kmem_cache *cachep) +{ +} + +static inline void memcg_cache_list_add(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ +} + +static inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + return cachep; +} + +static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 7d9ebb7cc982..f8f5162a3571 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -128,10 +128,7 @@ struct page { }; struct list_head list; /* slobs list of pages */ - struct { /* slab fields */ - struct kmem_cache *slab_cache; - struct slab *slab_page; - }; + struct slab *slab_page; /* slab fields */ }; /* Remainder is not double word aligned */ @@ -146,7 +143,7 @@ struct page { #if USE_SPLIT_PTLOCKS spinlock_t ptl; #endif - struct kmem_cache *slab; /* SLUB: Pointer to slab */ + struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ }; diff --git a/trunk/include/linux/mnt_namespace.h b/trunk/include/linux/mnt_namespace.h index 5a8e3903d770..12b2ab510323 100644 --- a/trunk/include/linux/mnt_namespace.h +++ b/trunk/include/linux/mnt_namespace.h @@ -4,9 +4,10 @@ struct mnt_namespace; struct fs_struct; +struct user_namespace; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, - struct fs_struct *); + struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); extern const struct file_operations proc_mounts_operations; diff --git a/trunk/include/linux/moduleparam.h b/trunk/include/linux/moduleparam.h index d6a58065c09c..137b4198fc03 100644 --- a/trunk/include/linux/moduleparam.h +++ b/trunk/include/linux/moduleparam.h @@ -16,17 +16,15 @@ /* Chosen so that structs with an unsigned long line up. 
*/ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) -#define ___module_cat(a,b) __mod_ ## a ## b -#define __module_cat(a,b) ___module_cat(a,b) #ifdef MODULE #define __MODULE_INFO(tag, name, info) \ -static const char __module_cat(name,__LINE__)[] \ +static const char __UNIQUE_ID(name)[] \ __used __attribute__((section(".modinfo"), unused, aligned(1))) \ = __stringify(tag) "=" info #else /* !MODULE */ /* This struct is here for syntactic coherency, it is not used */ #define __MODULE_INFO(tag, name, info) \ - struct __module_cat(name,__LINE__) {} + struct __UNIQUE_ID(name) {} #endif #define __MODULE_PARM_TYPE(name, _type) \ __MODULE_INFO(parmtype, name##type, #name ":" _type) diff --git a/trunk/include/linux/mtd/blktrans.h b/trunk/include/linux/mtd/blktrans.h index ed270bd2e4df..4eb0a50d0c55 100644 --- a/trunk/include/linux/mtd/blktrans.h +++ b/trunk/include/linux/mtd/blktrans.h @@ -23,6 +23,7 @@ #include #include #include +#include struct hd_geometry; struct mtd_info; @@ -43,7 +44,8 @@ struct mtd_blktrans_dev { struct kref ref; struct gendisk *disk; struct attribute_group *disk_attributes; - struct task_struct *thread; + struct workqueue_struct *wq; + struct work_struct work; struct request_queue *rq; spinlock_t queue_lock; void *priv; diff --git a/trunk/include/linux/mtd/doc2000.h b/trunk/include/linux/mtd/doc2000.h index 0f6fea73a1f6..407d1e556c39 100644 --- a/trunk/include/linux/mtd/doc2000.h +++ b/trunk/include/linux/mtd/doc2000.h @@ -92,12 +92,26 @@ * Others use readb/writeb */ #if defined(__arm__) -#define ReadDOC_(adr, reg) ((unsigned char)(*(volatile __u32 *)(((unsigned long)adr)+((reg)<<2)))) -#define WriteDOC_(d, adr, reg) do{ *(volatile __u32 *)(((unsigned long)adr)+((reg)<<2)) = (__u32)d; wmb();} while(0) +static inline u8 ReadDOC_(u32 __iomem *addr, unsigned long reg) +{ + return __raw_readl(addr + reg); +} +static inline void WriteDOC_(u8 data, u32 __iomem *addr, unsigned long reg) +{ + __raw_writel(data, addr + reg); + wmb(); +} #define DOC_IOREMAP_LEN 0x8000 #elif defined(__ppc__) -#define ReadDOC_(adr, reg) ((unsigned char)(*(volatile __u16 *)(((unsigned long)adr)+((reg)<<1)))) -#define WriteDOC_(d, adr, reg) do{ *(volatile __u16 *)(((unsigned long)adr)+((reg)<<1)) = (__u16)d; wmb();} while(0) +static inline u8 ReadDOC_(u16 __iomem *addr, unsigned long reg) +{ + return __raw_readw(addr + reg); +} +static inline void WriteDOC_(u8 data, u16 __iomem *addr, unsigned long reg) +{ + __raw_writew(data, addr + reg); + wmb(); +} #define DOC_IOREMAP_LEN 0x4000 #else #define ReadDOC_(adr, reg) readb((void __iomem *)(adr) + (reg)) diff --git a/trunk/include/linux/mtd/fsmc.h b/trunk/include/linux/mtd/fsmc.h index b20029221fb1..d6ed61ef451d 100644 --- a/trunk/include/linux/mtd/fsmc.h +++ b/trunk/include/linux/mtd/fsmc.h @@ -155,9 +155,6 @@ struct fsmc_nand_platform_data { unsigned int width; unsigned int bank; - /* CLE, ALE offsets */ - unsigned int cle_off; - unsigned int ale_off; enum access_mode mode; void (*select_bank)(uint32_t bank, uint32_t busw); diff --git a/trunk/include/linux/mtd/gpmi-nand.h b/trunk/include/linux/mtd/gpmi-nand.h deleted file mode 100644 index ed3c4e09f3d1..000000000000 --- a/trunk/include/linux/mtd/gpmi-nand.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2011 Freescale Semiconductor, Inc. All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#ifndef __MACH_MXS_GPMI_NAND_H__ -#define __MACH_MXS_GPMI_NAND_H__ - -/* The size of the resources is fixed. */ -#define GPMI_NAND_RES_SIZE 6 - -/* Resource names for the GPMI NAND driver. */ -#define GPMI_NAND_GPMI_REGS_ADDR_RES_NAME "gpmi-nand" -#define GPMI_NAND_GPMI_INTERRUPT_RES_NAME "GPMI NAND GPMI Interrupt" -#define GPMI_NAND_BCH_REGS_ADDR_RES_NAME "bch" -#define GPMI_NAND_BCH_INTERRUPT_RES_NAME "bch" -#define GPMI_NAND_DMA_CHANNELS_RES_NAME "GPMI NAND DMA Channels" -#define GPMI_NAND_DMA_INTERRUPT_RES_NAME "gpmi-dma" - -/** - * struct gpmi_nand_platform_data - GPMI NAND driver platform data. - * - * This structure communicates platform-specific information to the GPMI NAND - * driver that can't be expressed as resources. - * - * @platform_init: A pointer to a function the driver will call to - * initialize the platform (e.g., set up the pin mux). - * @min_prop_delay_in_ns: Minimum propagation delay of GPMI signals to and - * from the NAND Flash device, in nanoseconds. - * @max_prop_delay_in_ns: Maximum propagation delay of GPMI signals to and - * from the NAND Flash device, in nanoseconds. - * @max_chip_count: The maximum number of chips for which the driver - * should configure the hardware. This value most - * likely reflects the number of pins that are - * connected to a NAND Flash device. If this is - * greater than the SoC hardware can support, the - * driver will print a message and fail to initialize. - * @partitions: An optional pointer to an array of partition - * descriptions. - * @partition_count: The number of elements in the partitions array. - */ -struct gpmi_nand_platform_data { - /* SoC hardware information. */ - int (*platform_init)(void); - - /* NAND Flash information. */ - unsigned int min_prop_delay_in_ns; - unsigned int max_prop_delay_in_ns; - unsigned int max_chip_count; - - /* Medium information. 
*/ - struct mtd_partition *partitions; - unsigned partition_count; -}; -#endif diff --git a/trunk/include/linux/mtd/map.h b/trunk/include/linux/mtd/map.h index 3595a0236b0f..f6eb4332ac92 100644 --- a/trunk/include/linux/mtd/map.h +++ b/trunk/include/linux/mtd/map.h @@ -328,7 +328,7 @@ static inline int map_word_bitsset(struct map_info *map, map_word val1, map_word static inline map_word map_word_load(struct map_info *map, const void *ptr) { - map_word r; + map_word r = {{0} }; if (map_bankwidth_is_1(map)) r.x[0] = *(unsigned char *)ptr; @@ -391,7 +391,7 @@ static inline map_word map_word_ff(struct map_info *map) static inline map_word inline_map_read(struct map_info *map, unsigned long ofs) { - map_word r; + map_word uninitialized_var(r); if (map_bankwidth_is_1(map)) r.x[0] = __raw_readb(map->virt + ofs); diff --git a/trunk/include/linux/mtd/mtd.h b/trunk/include/linux/mtd/mtd.h index 81d61e704599..f9ac2897b86b 100644 --- a/trunk/include/linux/mtd/mtd.h +++ b/trunk/include/linux/mtd/mtd.h @@ -98,7 +98,7 @@ struct mtd_oob_ops { }; #define MTD_MAX_OOBFREE_ENTRIES_LARGE 32 -#define MTD_MAX_ECCPOS_ENTRIES_LARGE 448 +#define MTD_MAX_ECCPOS_ENTRIES_LARGE 640 /* * Internal ECC layout control structure. For historical reasons, there is a * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained diff --git a/trunk/include/linux/mtd/nand.h b/trunk/include/linux/mtd/nand.h index 24e915957e4f..7ccb3c59ed60 100644 --- a/trunk/include/linux/mtd/nand.h +++ b/trunk/include/linux/mtd/nand.h @@ -219,6 +219,13 @@ typedef enum { #define NAND_OWN_BUFFERS 0x00020000 /* Chip may not exist, so silence any errors in scan */ #define NAND_SCAN_SILENT_NODEV 0x00040000 +/* + * Autodetect nand buswidth with readid/onfi. + * This supposes the driver will configure the hardware in 8 bits mode + * when calling nand_scan_ident, and update its configuration + * before calling nand_scan_tail. + */ +#define NAND_BUSWIDTH_AUTO 0x00080000 /* Options set by nand scan */ /* Nand scan has allocated controller struct */ @@ -471,8 +478,8 @@ struct nand_buffers { * non 0 if ONFI supported. * @onfi_params: [INTERN] holds the ONFI page parameter when ONFI is * supported, 0 otherwise.
- * @onfi_set_features [REPLACEABLE] set the features for ONFI nand - * @onfi_get_features [REPLACEABLE] get the features for ONFI nand + * @onfi_set_features: [REPLACEABLE] set the features for ONFI nand + * @onfi_get_features: [REPLACEABLE] get the features for ONFI nand * @ecclayout: [REPLACEABLE] the default ECC placement scheme * @bbt: [INTERN] bad block table pointer * @bbt_td: [REPLACEABLE] bad block table descriptor for flash diff --git a/trunk/include/linux/mtd/sh_flctl.h b/trunk/include/linux/mtd/sh_flctl.h index 01e4b15b280e..1c28f8879b1c 100644 --- a/trunk/include/linux/mtd/sh_flctl.h +++ b/trunk/include/linux/mtd/sh_flctl.h @@ -20,6 +20,7 @@ #ifndef __SH_FLCTL_H__ #define __SH_FLCTL_H__ +#include #include #include #include @@ -107,6 +108,7 @@ #define ESTERINTE (0x1 << 24) /* ECC error interrupt enable */ #define AC1CLR (0x1 << 19) /* ECC FIFO clear */ #define AC0CLR (0x1 << 18) /* Data FIFO clear */ +#define DREQ0EN (0x1 << 16) /* FLDTFIFODMA Request Enable */ #define ECERB (0x1 << 9) /* ECC error */ #define STERB (0x1 << 8) /* Status error */ #define STERINTE (0x1 << 4) /* Status error enable */ @@ -138,6 +140,8 @@ enum flctl_ecc_res_t { FL_TIMEOUT }; +struct dma_chan; + struct sh_flctl { struct mtd_info mtd; struct nand_chip chip; @@ -147,7 +151,7 @@ struct sh_flctl { uint8_t done_buff[2048 + 64]; /* max size 2048 + 64 */ int read_bytes; - int index; + unsigned int index; int seqin_column; /* column in SEQIN cmd */ int seqin_page_addr; /* page_addr in SEQIN cmd */ uint32_t seqin_read_cmd; /* read cmd in SEQIN cmd */ @@ -161,6 +165,11 @@ struct sh_flctl { unsigned hwecc:1; /* Hardware ECC (0 = disabled, 1 = enabled) */ unsigned holden:1; /* Hardware has FLHOLDCR and HOLDEN is set */ unsigned qos_request:1; /* QoS request to prevent deep power shutdown */ + + /* DMA related objects */ + struct dma_chan *chan_fifo0_rx; + struct dma_chan *chan_fifo0_tx; + struct completion dma_complete; }; struct sh_flctl_platform_data { @@ -170,6 +179,9 @@ struct sh_flctl_platform_data { unsigned has_hwecc:1; unsigned use_holden:1; + + unsigned int slave_id_fifo0_tx; + unsigned int slave_id_fifo0_rx; }; static inline struct sh_flctl *mtd_to_flctl(struct mtd_info *mtdinfo) diff --git a/trunk/include/linux/nfs_fs_sb.h b/trunk/include/linux/nfs_fs_sb.h index a9e76ee1adca..6c6ed153a9b4 100644 --- a/trunk/include/linux/nfs_fs_sb.h +++ b/trunk/include/linux/nfs_fs_sb.h @@ -198,51 +198,4 @@ struct nfs_server { #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) - -/* maximum number of slots to use */ -#define NFS4_DEF_SLOT_TABLE_SIZE (16U) -#define NFS4_MAX_SLOT_TABLE (256U) -#define NFS4_NO_SLOT ((u32)-1) - -#if IS_ENABLED(CONFIG_NFS_V4) - -/* Sessions */ -#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) -struct nfs4_slot_table { - struct nfs4_slot *slots; /* seqid per slot */ - unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ - spinlock_t slot_tbl_lock; - struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ - u32 max_slots; /* # slots in table */ - u32 highest_used_slotid; /* sent to server on each SEQ. 
- * op for dynamic resizing */ - u32 target_max_slots; /* Set by CB_RECALL_SLOT as - * the new max_slots */ - struct completion complete; -}; - -static inline int slot_idx(struct nfs4_slot_table *tbl, struct nfs4_slot *sp) -{ - return sp - tbl->slots; -} - -/* - * Session related parameters - */ -struct nfs4_session { - struct nfs4_sessionid sess_id; - u32 flags; - unsigned long session_state; - u32 hash_alg; - u32 ssv_len; - - /* The fore and back channel */ - struct nfs4_channel_attrs fc_attrs; - struct nfs4_slot_table fc_slot_table; - struct nfs4_channel_attrs bc_attrs; - struct nfs4_slot_table bc_slot_table; - struct nfs_client *clp; -}; - -#endif /* CONFIG_NFS_V4 */ #endif diff --git a/trunk/include/linux/nfs_xdr.h b/trunk/include/linux/nfs_xdr.h index a73ea89789d1..29adb12c7ecf 100644 --- a/trunk/include/linux/nfs_xdr.h +++ b/trunk/include/linux/nfs_xdr.h @@ -185,23 +185,20 @@ struct nfs4_channel_attrs { u32 max_reqs; }; -/* nfs41 sessions slot seqid */ -struct nfs4_slot { - u32 seq_nr; -}; - +struct nfs4_slot; struct nfs4_sequence_args { - struct nfs4_session *sa_session; - u32 sa_slotid; - u8 sa_cache_this; + struct nfs4_slot *sa_slot; + u8 sa_cache_this : 1, + sa_privileged : 1; }; struct nfs4_sequence_res { - struct nfs4_session *sr_session; struct nfs4_slot *sr_slot; /* slot used to send request */ + unsigned long sr_timestamp; int sr_status; /* sequence operation status */ - unsigned long sr_renewal_time; u32 sr_status_flags; + u32 sr_highest_slotid; + u32 sr_target_highest_slotid; }; struct nfs4_get_lease_time_args { @@ -209,8 +206,8 @@ struct nfs4_get_lease_time_args { }; struct nfs4_get_lease_time_res { - struct nfs_fsinfo *lr_fsinfo; struct nfs4_sequence_res lr_seq_res; + struct nfs_fsinfo *lr_fsinfo; }; #define PNFS_LAYOUT_MAXSIZE 4096 @@ -228,23 +225,23 @@ struct pnfs_layout_range { }; struct nfs4_layoutget_args { + struct nfs4_sequence_args seq_args; __u32 type; struct pnfs_layout_range range; __u64 minlength; __u32 maxcount; struct inode *inode; struct nfs_open_context *ctx; - struct nfs4_sequence_args seq_args; nfs4_stateid stateid; struct nfs4_layoutdriver_data layout; }; struct nfs4_layoutget_res { + struct nfs4_sequence_res seq_res; __u32 return_on_close; struct pnfs_layout_range range; __u32 type; nfs4_stateid stateid; - struct nfs4_sequence_res seq_res; struct nfs4_layoutdriver_data *layoutp; }; @@ -255,38 +252,38 @@ struct nfs4_layoutget { }; struct nfs4_getdevicelist_args { + struct nfs4_sequence_args seq_args; const struct nfs_fh *fh; u32 layoutclass; - struct nfs4_sequence_args seq_args; }; struct nfs4_getdevicelist_res { - struct pnfs_devicelist *devlist; struct nfs4_sequence_res seq_res; + struct pnfs_devicelist *devlist; }; struct nfs4_getdeviceinfo_args { - struct pnfs_device *pdev; struct nfs4_sequence_args seq_args; + struct pnfs_device *pdev; }; struct nfs4_getdeviceinfo_res { - struct pnfs_device *pdev; struct nfs4_sequence_res seq_res; + struct pnfs_device *pdev; }; struct nfs4_layoutcommit_args { + struct nfs4_sequence_args seq_args; nfs4_stateid stateid; __u64 lastbytewritten; struct inode *inode; const u32 *bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_layoutcommit_res { + struct nfs4_sequence_res seq_res; struct nfs_fattr *fattr; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; int status; }; @@ -300,11 +297,11 @@ struct nfs4_layoutcommit_data { }; struct nfs4_layoutreturn_args { + struct nfs4_sequence_args seq_args; struct pnfs_layout_hdr *layout; struct inode *inode; nfs4_stateid stateid; __u32 layout_type; - 
struct nfs4_sequence_args seq_args; }; struct nfs4_layoutreturn_res { @@ -330,6 +327,7 @@ struct stateowner_id { * Arguments to the open call. */ struct nfs_openargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; struct nfs_seqid * seqid; int open_flags; @@ -350,10 +348,10 @@ struct nfs_openargs { const u32 * bitmask; const u32 * open_bitmap; __u32 claim; - struct nfs4_sequence_args seq_args; }; struct nfs_openres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_fh fh; struct nfs4_change_info cinfo; @@ -368,7 +366,6 @@ struct nfs_openres { __u32 attrset[NFS4_BITMAP_SIZE]; struct nfs4_string *owner; struct nfs4_string *group_owner; - struct nfs4_sequence_res seq_res; __u32 access_request; __u32 access_supported; __u32 access_result; @@ -392,20 +389,20 @@ struct nfs_open_confirmres { * Arguments to the close call. */ struct nfs_closeargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; fmode_t fmode; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_closeres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_fattr * fattr; struct nfs_seqid * seqid; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * * Arguments to the lock,lockt, and locku call. @@ -417,6 +414,7 @@ struct nfs_lowner { }; struct nfs_lock_args { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct file_lock * fl; struct nfs_seqid * lock_seqid; @@ -427,40 +425,39 @@ struct nfs_lock_args { unsigned char block : 1; unsigned char reclaim : 1; unsigned char new_lock_owner : 1; - struct nfs4_sequence_args seq_args; }; struct nfs_lock_res { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * lock_seqid; struct nfs_seqid * open_seqid; - struct nfs4_sequence_res seq_res; }; struct nfs_locku_args { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct file_lock * fl; struct nfs_seqid * seqid; nfs4_stateid * stateid; - struct nfs4_sequence_args seq_args; }; struct nfs_locku_res { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * seqid; - struct nfs4_sequence_res seq_res; }; struct nfs_lockt_args { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct file_lock * fl; struct nfs_lowner lock_owner; - struct nfs4_sequence_args seq_args; }; struct nfs_lockt_res { - struct file_lock * denied; /* LOCK, LOCKT failed */ struct nfs4_sequence_res seq_res; + struct file_lock * denied; /* LOCK, LOCKT failed */ }; struct nfs_release_lockowner_args { @@ -468,22 +465,23 @@ struct nfs_release_lockowner_args { }; struct nfs4_delegreturnargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; const nfs4_stateid *stateid; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_delegreturnres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * Arguments to the read call. */ struct nfs_readargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct nfs_open_context *context; struct nfs_lock_context *lock_context; @@ -491,20 +489,20 @@ struct nfs_readargs { __u32 count; unsigned int pgbase; struct page ** pages; - struct nfs4_sequence_args seq_args; }; struct nfs_readres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; __u32 count; int eof; - struct nfs4_sequence_res seq_res; }; /* * Arguments to the write call. 
*/ struct nfs_writeargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct nfs_open_context *context; struct nfs_lock_context *lock_context; @@ -514,7 +512,6 @@ struct nfs_writeargs { unsigned int pgbase; struct page ** pages; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_write_verifier { @@ -527,65 +524,65 @@ struct nfs_writeverf { }; struct nfs_writeres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; struct nfs_writeverf * verf; __u32 count; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * Arguments to the commit call. */ struct nfs_commitargs { + struct nfs4_sequence_args seq_args; struct nfs_fh *fh; __u64 offset; __u32 count; const u32 *bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_commitres { + struct nfs4_sequence_res seq_res; struct nfs_fattr *fattr; struct nfs_writeverf *verf; const struct nfs_server *server; - struct nfs4_sequence_res seq_res; }; /* * Common arguments to the unlink call */ struct nfs_removeargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh *fh; struct qstr name; - struct nfs4_sequence_args seq_args; }; struct nfs_removeres { + struct nfs4_sequence_res seq_res; const struct nfs_server *server; struct nfs_fattr *dir_attr; struct nfs4_change_info cinfo; - struct nfs4_sequence_res seq_res; }; /* * Common arguments to the rename call */ struct nfs_renameargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh *old_dir; const struct nfs_fh *new_dir; const struct qstr *old_name; const struct qstr *new_name; - struct nfs4_sequence_args seq_args; }; struct nfs_renameres { + struct nfs4_sequence_res seq_res; const struct nfs_server *server; struct nfs4_change_info old_cinfo; struct nfs_fattr *old_fattr; struct nfs4_change_info new_cinfo; struct nfs_fattr *new_fattr; - struct nfs4_sequence_res seq_res; }; /* @@ -626,20 +623,20 @@ struct nfs_createargs { }; struct nfs_setattrargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; nfs4_stateid stateid; struct iattr * iap; const struct nfs_server * server; /* Needed for name mapping */ const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs_setaclargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; - struct nfs4_sequence_args seq_args; }; struct nfs_setaclres { @@ -647,27 +644,27 @@ struct nfs_setaclres { }; struct nfs_getaclargs { + struct nfs4_sequence_args seq_args; struct nfs_fh * fh; size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; - struct nfs4_sequence_args seq_args; }; /* getxattr ACL interface flags */ #define NFS4_ACL_TRUNC 0x0001 /* ACL was truncated */ struct nfs_getaclres { + struct nfs4_sequence_res seq_res; size_t acl_len; size_t acl_data_offset; int acl_flags; struct page * acl_scratch; - struct nfs4_sequence_res seq_res; }; struct nfs_setattrres { + struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; const struct nfs_server * server; - struct nfs4_sequence_res seq_res; }; struct nfs_linkargs { @@ -832,21 +829,22 @@ struct nfs3_getaclres { typedef u64 clientid4; struct nfs4_accessargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; u32 access; - struct nfs4_sequence_args seq_args; }; struct nfs4_accessres { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; u32 supported; u32 access; - struct nfs4_sequence_res seq_res; }; struct nfs4_create_arg { + struct nfs4_sequence_args 
seq_args; u32 ftype; union { struct { @@ -863,88 +861,88 @@ struct nfs4_create_arg { const struct iattr * attrs; const struct nfs_fh * dir_fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_create_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fh * fh; struct nfs_fattr * fattr; struct nfs4_change_info dir_cinfo; - struct nfs4_sequence_res seq_res; }; struct nfs4_fsinfo_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_fsinfo_res { - struct nfs_fsinfo *fsinfo; struct nfs4_sequence_res seq_res; + struct nfs_fsinfo *fsinfo; }; struct nfs4_getattr_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_getattr_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; - struct nfs4_sequence_res seq_res; }; struct nfs4_link_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const struct nfs_fh * dir_fh; const struct qstr * name; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_link_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; struct nfs4_change_info cinfo; struct nfs_fattr * dir_attr; - struct nfs4_sequence_res seq_res; }; struct nfs4_lookup_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * dir_fh; const struct qstr * name; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_lookup_res { + struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; struct nfs_fh * fh; - struct nfs4_sequence_res seq_res; }; struct nfs4_lookup_root_arg { - const u32 * bitmask; struct nfs4_sequence_args seq_args; + const u32 * bitmask; }; struct nfs4_pathconf_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_pathconf_res { - struct nfs_pathconf *pathconf; struct nfs4_sequence_res seq_res; + struct nfs_pathconf *pathconf; }; struct nfs4_readdir_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; u64 cookie; nfs4_verifier verifier; @@ -953,21 +951,20 @@ struct nfs4_readdir_arg { unsigned int pgbase; /* zero-copy data */ const u32 * bitmask; int plus; - struct nfs4_sequence_args seq_args; }; struct nfs4_readdir_res { + struct nfs4_sequence_res seq_res; nfs4_verifier verifier; unsigned int pgbase; - struct nfs4_sequence_res seq_res; }; struct nfs4_readlink { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; unsigned int pgbase; unsigned int pglen; /* zero-copy data */ struct page ** pages; /* zero-copy data */ - struct nfs4_sequence_args seq_args; }; struct nfs4_readlink_res { @@ -993,28 +990,28 @@ struct nfs4_setclientid_res { }; struct nfs4_statfs_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_statfs_res { - struct nfs_fsstat *fsstat; struct nfs4_sequence_res seq_res; + struct nfs_fsstat *fsstat; }; struct nfs4_server_caps_arg { - struct nfs_fh *fhandle; struct nfs4_sequence_args seq_args; + struct nfs_fh *fhandle; }; struct nfs4_server_caps_res { + struct nfs4_sequence_res seq_res; u32 attr_bitmask[3]; u32 acl_bitmask; u32 has_links; u32 has_symlinks; u32 fh_expire_type; - struct nfs4_sequence_res seq_res; }; #define 
NFS4_PATHNAME_MAXCOMPONENTS 512 @@ -1040,16 +1037,16 @@ struct nfs4_fs_locations { }; struct nfs4_fs_locations_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh *dir_fh; const struct qstr *name; struct page *page; const u32 *bitmask; - struct nfs4_sequence_args seq_args; }; struct nfs4_fs_locations_res { - struct nfs4_fs_locations *fs_locations; struct nfs4_sequence_res seq_res; + struct nfs4_fs_locations *fs_locations; }; struct nfs4_secinfo_oid { @@ -1074,14 +1071,14 @@ struct nfs4_secinfo_flavors { }; struct nfs4_secinfo_arg { + struct nfs4_sequence_args seq_args; const struct nfs_fh *dir_fh; const struct qstr *name; - struct nfs4_sequence_args seq_args; }; struct nfs4_secinfo_res { - struct nfs4_secinfo_flavors *flavors; struct nfs4_sequence_res seq_res; + struct nfs4_secinfo_flavors *flavors; }; #endif /* CONFIG_NFS_V4 */ @@ -1161,9 +1158,9 @@ struct nfs41_create_session_res { }; struct nfs41_reclaim_complete_args { + struct nfs4_sequence_args seq_args; /* In the future extend to include curr_fh for use with migration */ unsigned char one_fs:1; - struct nfs4_sequence_args seq_args; }; struct nfs41_reclaim_complete_res { @@ -1173,28 +1170,28 @@ struct nfs41_reclaim_complete_res { #define SECINFO_STYLE_CURRENT_FH 0 #define SECINFO_STYLE_PARENT 1 struct nfs41_secinfo_no_name_args { - int style; struct nfs4_sequence_args seq_args; + int style; }; struct nfs41_test_stateid_args { - nfs4_stateid *stateid; struct nfs4_sequence_args seq_args; + nfs4_stateid *stateid; }; struct nfs41_test_stateid_res { - unsigned int status; struct nfs4_sequence_res seq_res; + unsigned int status; }; struct nfs41_free_stateid_args { - nfs4_stateid *stateid; struct nfs4_sequence_args seq_args; + nfs4_stateid *stateid; }; struct nfs41_free_stateid_res { - unsigned int status; struct nfs4_sequence_res seq_res; + unsigned int status; }; #else diff --git a/trunk/include/linux/nsproxy.h b/trunk/include/linux/nsproxy.h index cc37a55ad004..10e5947491c7 100644 --- a/trunk/include/linux/nsproxy.h +++ b/trunk/include/linux/nsproxy.h @@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, - struct fs_struct *); + struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) diff --git a/trunk/include/linux/of.h b/trunk/include/linux/of.h index 6cfea9aa401f..5ebcc5c8e423 100644 --- a/trunk/include/linux/of.h +++ b/trunk/include/linux/of.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -282,16 +283,28 @@ extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_machine_is_compatible(const char *compat); -extern int prom_add_property(struct device_node* np, struct property* prop); -extern int prom_remove_property(struct device_node *np, struct property *prop); -extern int prom_update_property(struct device_node *np, - struct property *newprop); +extern int of_add_property(struct device_node *np, struct property *prop); +extern int of_remove_property(struct device_node *np, struct property *prop); +extern int of_update_property(struct device_node *np, struct property *newprop); -#if defined(CONFIG_OF_DYNAMIC) /* For updating the device tree at runtime */ -extern void of_attach_node(struct device_node *); -extern void of_detach_node(struct device_node *); -#endif +#define OF_RECONFIG_ATTACH_NODE 0x0001 
+#define OF_RECONFIG_DETACH_NODE 0x0002 +#define OF_RECONFIG_ADD_PROPERTY 0x0003 +#define OF_RECONFIG_REMOVE_PROPERTY 0x0004 +#define OF_RECONFIG_UPDATE_PROPERTY 0x0005 + +struct of_prop_reconfig { + struct device_node *dn; + struct property *prop; +}; + +extern int of_reconfig_notifier_register(struct notifier_block *); +extern int of_reconfig_notifier_unregister(struct notifier_block *); +extern int of_reconfig_notify(unsigned long, void *); + +extern int of_attach_node(struct device_node *); +extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) diff --git a/trunk/include/linux/of_platform.h b/trunk/include/linux/of_platform.h index b47d2040c9f2..3863a4dbdf18 100644 --- a/trunk/include/linux/of_platform.h +++ b/trunk/include/linux/of_platform.h @@ -100,6 +100,7 @@ extern int of_platform_populate(struct device_node *root, #if !defined(CONFIG_OF_ADDRESS) struct of_dev_auxdata; +struct device; static inline int of_platform_populate(struct device_node *root, const struct of_device_id *matches, const struct of_dev_auxdata *lookup, diff --git a/trunk/include/linux/percpu-rwsem.h b/trunk/include/linux/percpu-rwsem.h index bd1e86071e57..3e88c9a7d57f 100644 --- a/trunk/include/linux/percpu-rwsem.h +++ b/trunk/include/linux/percpu-rwsem.h @@ -1,83 +1,34 @@ #ifndef _LINUX_PERCPU_RWSEM_H #define _LINUX_PERCPU_RWSEM_H -#include +#include +#include #include -#include -#include +#include +#include struct percpu_rw_semaphore { - unsigned __percpu *counters; - bool locked; - struct mutex mtx; + unsigned int __percpu *fast_read_ctr; + atomic_t write_ctr; + struct rw_semaphore rw_sem; + atomic_t slow_read_ctr; + wait_queue_head_t write_waitq; }; -#define light_mb() barrier() -#define heavy_mb() synchronize_sched_expedited() +extern void percpu_down_read(struct percpu_rw_semaphore *); +extern void percpu_up_read(struct percpu_rw_semaphore *); -static inline void percpu_down_read(struct percpu_rw_semaphore *p) -{ - rcu_read_lock_sched(); - if (unlikely(p->locked)) { - rcu_read_unlock_sched(); - mutex_lock(&p->mtx); - this_cpu_inc(*p->counters); - mutex_unlock(&p->mtx); - return; - } - this_cpu_inc(*p->counters); - rcu_read_unlock_sched(); - light_mb(); /* A, between read of p->locked and read of data, paired with D */ -} +extern void percpu_down_write(struct percpu_rw_semaphore *); +extern void percpu_up_write(struct percpu_rw_semaphore *); -static inline void percpu_up_read(struct percpu_rw_semaphore *p) -{ - light_mb(); /* B, between read of the data and write to p->counter, paired with C */ - this_cpu_dec(*p->counters); -} +extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, + const char *, struct lock_class_key *); +extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -static inline unsigned __percpu_count(unsigned __percpu *counters) -{ - unsigned total = 0; - int cpu; - - for_each_possible_cpu(cpu) - total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu)); - - return total; -} - -static inline void percpu_down_write(struct percpu_rw_semaphore *p) -{ - mutex_lock(&p->mtx); - p->locked = true; - synchronize_sched_expedited(); /* make sure that all readers exit the rcu_read_lock_sched region */ - while (__percpu_count(p->counters)) - msleep(1); - heavy_mb(); /* C, between read of p->counter and write to data, paired with B */ -} - -static inline void percpu_up_write(struct percpu_rw_semaphore *p) -{ - heavy_mb(); /* D, between write to data and write to p->locked, paired with A */ - p->locked = false; - mutex_unlock(&p->mtx); -} - -static inline int 
percpu_init_rwsem(struct percpu_rw_semaphore *p) -{ - p->counters = alloc_percpu(unsigned); - if (unlikely(!p->counters)) - return -ENOMEM; - p->locked = false; - mutex_init(&p->mtx); - return 0; -} - -static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p) -{ - free_percpu(p->counters); - p->counters = NULL; /* catch use after free bugs */ -} +#define percpu_init_rwsem(brw) \ +({ \ + static struct lock_class_key rwsem_key; \ + __percpu_init_rwsem(brw, #brw, &rwsem_key); \ +}) #endif diff --git a/trunk/include/linux/pid_namespace.h b/trunk/include/linux/pid_namespace.h index 65e3e87eacc5..bf285999273a 100644 --- a/trunk/include/linux/pid_namespace.h +++ b/trunk/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -31,9 +32,12 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ + unsigned int proc_inum; }; extern struct pid_namespace init_pid_ns; @@ -46,7 +50,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); +extern struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); @@ -59,8 +64,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -static inline struct pid_namespace * -copy_pid_ns(unsigned long flags, struct pid_namespace *ns) +static inline struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); diff --git a/trunk/include/linux/platform_data/i2c-cbus-gpio.h b/trunk/include/linux/platform_data/i2c-cbus-gpio.h new file mode 100644 index 000000000000..6faa992a9502 --- /dev/null +++ b/trunk/include/linux/platform_data/i2c-cbus-gpio.h @@ -0,0 +1,27 @@ +/* + * i2c-cbus-gpio.h - CBUS I2C platform_data definition + * + * Copyright (C) 2004-2009 Nokia Corporation + * + * Written by Felipe Balbi and Aaro Koskinen. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of this + * archive for more details. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __INCLUDE_LINUX_I2C_CBUS_GPIO_H +#define __INCLUDE_LINUX_I2C_CBUS_GPIO_H + +struct i2c_cbus_platform_data { + int dat_gpio; + int clk_gpio; + int sel_gpio; +}; + +#endif /* __INCLUDE_LINUX_I2C_CBUS_GPIO_H */ diff --git a/trunk/include/linux/platform_data/lp855x.h b/trunk/include/linux/platform_data/lp855x.h index 761f31752367..e81f62d24ee2 100644 --- a/trunk/include/linux/platform_data/lp855x.h +++ b/trunk/include/linux/platform_data/lp855x.h @@ -89,11 +89,6 @@ enum lp8556_brightness_source { LP8556_COMBINED2, /* pwm + i2c after the shaper block */ }; -struct lp855x_pwm_data { - void (*pwm_set_intensity) (int brightness, int max_brightness); - int (*pwm_get_intensity) (int max_brightness); -}; - struct lp855x_rom_data { u8 addr; u8 val; @@ -105,7 +100,7 @@ struct lp855x_rom_data { * @mode : brightness control by pwm or lp855x register * @device_control : value of DEVICE CONTROL register * @initial_brightness : initial value of backlight brightness - * @pwm_data : platform specific pwm generation functions. + * @period_ns : platform specific pwm period value. unit is nano. Only valid when mode is PWM_BASED. * @load_new_rom_data : 0 : use default configuration data @@ -118,7 +113,7 @@ struct lp855x_platform_data { enum lp855x_brightness_ctrl_mode mode; u8 device_control; int initial_brightness; - struct lp855x_pwm_data pwm_data; + unsigned int period_ns; u8 load_new_rom_data; int size_program; struct lp855x_rom_data *rom_data; diff --git a/trunk/include/linux/platform_data/mtd-nomadik-nand.h b/trunk/include/linux/platform_data/mtd-nomadik-nand.h deleted file mode 100644 index c3c8254c22a5..000000000000 --- a/trunk/include/linux/platform_data/mtd-nomadik-nand.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __ASM_ARCH_NAND_H -#define __ASM_ARCH_NAND_H - -struct nomadik_nand_platform_data { - struct mtd_partition *parts; - int nparts; - int options; - int (*init) (void); - int (*exit) (void); -}; - -#define NAND_IO_DATA 0x40000000 -#define NAND_IO_CMD 0x40800000 -#define NAND_IO_ADDR 0x41000000 - -#endif /* __ASM_ARCH_NAND_H */ diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index 3fd2e871ff1b..32676b35d2f5 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -28,7 +28,11 @@ struct mm_struct; */ enum { - PROC_ROOT_INO = 1, + PROC_ROOT_INO = 1, + PROC_IPC_INIT_INO = 0xEFFFFFFFU, + PROC_UTS_INIT_INO = 0xEFFFFFFEU, + PROC_USER_INIT_INO = 0xEFFFFFFDU, + PROC_PID_INIT_INO = 0xEFFFFFFCU, }; /* @@ -174,7 +178,10 @@ extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, struct proc_dir_entry *parent); extern struct file *proc_ns_fget(int fd); +extern bool proc_ns_inode(struct inode *inode); +extern int proc_alloc_inum(unsigned int *pino); +extern void proc_free_inum(unsigned int inum); #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -229,6 +236,19 @@ static inline struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } +static inline bool proc_ns_inode(struct inode *inode) +{ + return false; +} + +static inline int proc_alloc_inum(unsigned int *inum) +{ + *inum = 1; + return 0; +} +static inline void proc_free_inum(unsigned int inum) +{ +} #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -247,10 +267,14 @@ struct proc_ns_operations { void *(*get)(struct task_struct *task); void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); + unsigned int (*inum)(void *ns); }; extern const struct proc_ns_operations netns_operations; extern const 
struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); @@ -290,4 +314,7 @@ static inline struct net *PDE_NET(struct proc_dir_entry *pde) return pde->parent->data; } +#include + +void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set); #endif /* _LINUX_PROC_FS_H */ diff --git a/trunk/include/linux/ptrace.h b/trunk/include/linux/ptrace.h index a89ff04bddd9..addfbe7c180e 100644 --- a/trunk/include/linux/ptrace.h +++ b/trunk/include/linux/ptrace.h @@ -32,6 +32,8 @@ #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) +#define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) + /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 #define PT_SINGLESTEP (1<> 32) ^ (seed << 10) ^ seed; diff --git a/trunk/include/linux/res_counter.h b/trunk/include/linux/res_counter.h index 6f54e40fa218..5ae8456d9670 100644 --- a/trunk/include/linux/res_counter.h +++ b/trunk/include/linux/res_counter.h @@ -125,14 +125,16 @@ int res_counter_charge_nofail(struct res_counter *counter, * * these calls check for usage underflow and show a warning on the console * _locked call expects the counter->lock to be taken + * + * returns the total charges still present in @counter. */ -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); -void res_counter_uncharge(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val); +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val); /** * res_counter_margin - calculate chargeable space of a counter * @cnt: the counter diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index b089c92c609b..f712465b05c5 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -1597,6 +1597,7 @@ struct task_struct { unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; + unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; @@ -1778,12 +1779,6 @@ static inline int is_global_init(struct task_struct *tsk) return tsk->pid == 1; } -/* - * is_container_init: - * check whether in the task is init in its own pid namespace. - */ -extern int is_container_init(struct task_struct *tsk); - extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); diff --git a/trunk/include/linux/security.h b/trunk/include/linux/security.h index 05e88bdcf7d9..0f6afc657f77 100644 --- a/trunk/include/linux/security.h +++ b/trunk/include/linux/security.h @@ -694,6 +694,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * userspace to load a kernel module with the given name. * @kmod_name name of the module requested by the kernel * Return 0 if successful. + * @kernel_module_from_file: + * Load a kernel module from userspace. 
+ * @file contains the file structure pointing to the file containing + * the kernel module to load. If the module is being loaded from a blob, + * this argument will be NULL. + * Return 0 if permission is granted. * @task_fix_setuid: * Update the module's state after setting one or more of the user * identity attributes of the current process. The @flags parameter @@ -1508,6 +1514,7 @@ struct security_operations { int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*kernel_module_request)(char *kmod_name); + int (*kernel_module_from_file)(struct file *file); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); int (*task_setpgid) (struct task_struct *p, pid_t pgid); @@ -1765,6 +1772,7 @@ void security_transfer_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_kernel_module_request(char *kmod_name); +int security_kernel_module_from_file(struct file *file); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); int security_task_setpgid(struct task_struct *p, pid_t pgid); @@ -2278,6 +2286,11 @@ static inline int security_kernel_module_request(char *kmod_name) return 0; } +static inline int security_kernel_module_from_file(struct file *file) +{ + return 0; +} + static inline int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) diff --git a/trunk/include/linux/slab.h b/trunk/include/linux/slab.h index 83d1a1454b7e..5d168d7e0a28 100644 --- a/trunk/include/linux/slab.h +++ b/trunk/include/linux/slab.h @@ -11,6 +11,8 @@ #include #include +#include + /* * Flags to pass to kmem_cache_create(). @@ -116,6 +118,7 @@ struct kmem_cache { }; #endif +struct mem_cgroup; /* * struct kmem_cache related prototypes */ @@ -125,10 +128,12 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, + unsigned long, void (*)(void *), struct kmem_cache *); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); -unsigned int kmem_cache_size(struct kmem_cache *); /* * Please use this macro to create slab caches. Simply specify the @@ -176,6 +181,48 @@ unsigned int kmem_cache_size(struct kmem_cache *); #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * This is the main placeholder for memcg-related information in kmem caches. + * struct kmem_cache will hold a pointer to it, so the memory cost while + * disabled is 1 pointer. The runtime cost while enabled gets bigger than it + * would otherwise be if that would be bundled in kmem_cache: we'll need an + * extra pointer chase. But the trade-off clearly lies in favor of not + * penalizing non-users. + * + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. + * + * Child caches will hold extra metadata needed for its operation.
Fields are: + * + * @memcg: pointer to the memcg this cache belongs to + * @list: list_head for the list of all caches in this memcg + * @root_cache: pointer to the global, root cache, this cache was derived from + * @dead: set to true after the memcg dies; the cache may still be around. + * @nr_pages: number of pages that belong to this cache. + * @destroy: worker to be called whenever we are ready, or believe we may be + * ready, to destroy this cache. + */ +struct memcg_cache_params { + bool is_root_cache; + union { + struct kmem_cache *memcg_caches[0]; + struct { + struct mem_cgroup *memcg; + struct list_head list; + struct kmem_cache *root_cache; + bool dead; + atomic_t nr_pages; + struct work_struct destroy; + }; + }; +}; + +int memcg_update_all_caches(int num_memcgs); + +struct seq_file; +int cache_show(struct kmem_cache *s, struct seq_file *m); +void print_slabinfo_header(struct seq_file *m); /* * Common kmalloc functions provided by all allocators @@ -388,6 +435,14 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +/* + * Determine the size of a slab object + */ +static inline unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->object_size; +} + void __init kmem_cache_init_late(void); #endif /* _LINUX_SLAB_H */ diff --git a/trunk/include/linux/slab_def.h b/trunk/include/linux/slab_def.h index cc290f0bdb34..8bb6e0eaf3c6 100644 --- a/trunk/include/linux/slab_def.h +++ b/trunk/include/linux/slab_def.h @@ -81,6 +81,9 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; +#endif /* 6) per-cpu/per-node data, touched during every alloc/free */ /* @@ -89,9 +92,13 @@ struct kmem_cache { * (see kmem_cache_init()) * We still use [NR_CPUS] and not [1] or [0] because cache_cache * is statically defined, so we reserve the max number of cpus. + * + * We also need to guarantee that the list is able to accommodate a + * pointer for each node since "nodelists" uses the remainder of + * available pointers. */ struct kmem_list3 **nodelists; - struct array_cache *array[NR_CPUS]; + struct array_cache *array[NR_CPUS + MAX_NUMNODES]; /* * Do not add fields after array[] */ diff --git a/trunk/include/linux/slub_def.h b/trunk/include/linux/slub_def.h index df448adb7283..9db4825cd393 100644 --- a/trunk/include/linux/slub_def.h +++ b/trunk/include/linux/slub_def.h @@ -101,6 +101,10 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; + int max_attr_size; /* for propagation, maximum size of a stored attr */ +#endif #ifdef CONFIG_NUMA /* @@ -222,7 +226,10 @@ void *__kmalloc(size_t size, gfp_t flags); static __always_inline void * kmalloc_order(size_t size, gfp_t flags, unsigned int order) { - void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + void *ret; + + flags |= (__GFP_COMP | __GFP_KMEMCG); + ret = (void *) __get_free_pages(flags, order); kmemleak_alloc(ret, size, 1, flags); return ret; } diff --git a/trunk/include/linux/string.h b/trunk/include/linux/string.h index 630125818ca8..ac889c5ea11b 100644 --- a/trunk/include/linux/string.h +++ b/trunk/include/linux/string.h @@ -143,4 +143,15 @@ static inline bool strstarts(const char *str, const char *prefix) extern size_t memweight(const void *ptr, size_t bytes); +/** + * kbasename - return the last part of a pathname. + * + * @path: path to extract the filename from.
+ */ +static inline const char *kbasename(const char *path) +{ + const char *tail = strrchr(path, '/'); + return tail ? tail + 1 : path; +} + #endif /* _LINUX_STRING_H_ */ diff --git a/trunk/include/linux/sunrpc/sched.h b/trunk/include/linux/sunrpc/sched.h index dc0c3cc3ada3..b64f8eb0b973 100644 --- a/trunk/include/linux/sunrpc/sched.h +++ b/trunk/include/linux/sunrpc/sched.h @@ -192,7 +192,6 @@ struct rpc_wait_queue { pid_t owner; /* process id of last task serviced */ unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ unsigned char priority; /* current priority */ - unsigned char count; /* # task groups remaining serviced so far */ unsigned char nr; /* # tasks remaining for cookie */ unsigned short qlen; /* total # tasks waiting in queue */ struct rpc_timer timer_list; diff --git a/trunk/include/linux/syscalls.h b/trunk/include/linux/syscalls.h index 91835e7f364d..6caee34bf8a2 100644 --- a/trunk/include/linux/syscalls.h +++ b/trunk/include/linux/syscalls.h @@ -560,10 +560,10 @@ asmlinkage long sys_utime(char __user *filename, asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes); asmlinkage long sys_lseek(unsigned int fd, off_t offset, - unsigned int origin); + unsigned int whence); asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, unsigned long offset_low, loff_t __user *result, - unsigned int origin); + unsigned int whence); asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); asmlinkage long sys_readv(unsigned long fd, @@ -880,4 +880,5 @@ asmlinkage long sys_process_vm_writev(pid_t pid, asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2); +asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags); #endif diff --git a/trunk/include/linux/thread_info.h b/trunk/include/linux/thread_info.h index ccc1899bd62e..e7e04736802f 100644 --- a/trunk/include/linux/thread_info.h +++ b/trunk/include/linux/thread_info.h @@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif +#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) + /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/trunk/include/linux/usb/usbnet.h b/trunk/include/linux/usb/usbnet.h index 9bbeabf66c54..bd45eb7bedc8 100644 --- a/trunk/include/linux/usb/usbnet.h +++ b/trunk/include/linux/usb/usbnet.h @@ -69,6 +69,7 @@ struct usbnet { # define EVENT_DEV_ASLEEP 6 # define EVENT_DEV_OPEN 7 # define EVENT_DEVICE_REPORT_IDLE 8 +# define EVENT_NO_RUNTIME_PM 9 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -240,4 +241,6 @@ extern void usbnet_set_msglevel(struct net_device *, u32); extern void usbnet_get_drvinfo(struct net_device *, struct ethtool_drvinfo *); extern int usbnet_nway_reset(struct net_device *net); +extern int usbnet_manage_power(struct usbnet *, int); + #endif /* __LINUX_USB_USBNET_H */ diff --git a/trunk/include/linux/user_namespace.h b/trunk/include/linux/user_namespace.h index 95142cae446a..b9bd2e6c73cc 100644 --- a/trunk/include/linux/user_namespace.h +++ b/trunk/include/linux/user_namespace.h @@ -25,6 +25,7 @@ struct user_namespace { struct user_namespace *parent; kuid_t owner; kgid_t group; + unsigned int proc_inum; }; extern struct user_namespace init_user_ns; @@ -39,6 +40,7 @@ static inline struct user_namespace 
*get_user_ns(struct user_namespace *ns) } extern int create_user_ns(struct cred *new); +extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -66,6 +68,14 @@ static inline int create_user_ns(struct cred *new) return -EINVAL; } +static inline int unshare_userns(unsigned long unshare_flags, + struct cred **new_cred) +{ + if (unshare_flags & CLONE_NEWUSER) + return -EINVAL; + return 0; +} + static inline void put_user_ns(struct user_namespace *ns) { } diff --git a/trunk/include/linux/utsname.h b/trunk/include/linux/utsname.h index 2b345206722a..239e27733d6c 100644 --- a/trunk/include/linux/utsname.h +++ b/trunk/include/linux/utsname.h @@ -23,6 +23,7 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; + unsigned int proc_inum; }; extern struct uts_namespace init_uts_ns; @@ -33,7 +34,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) } extern struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk); + struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct kref *kref); static inline void put_uts_ns(struct uts_namespace *ns) @@ -50,12 +51,12 @@ static inline void put_uts_ns(struct uts_namespace *ns) } static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) return ERR_PTR(-EINVAL); - return tsk->nsproxy->uts_ns; + return old_ns; } #endif diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 168dfe122dd3..7cb64d4b499d 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -550,6 +550,170 @@ do { \ __ret; \ }) + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + finish_wait(&wq, &__wait); \ +} while (0) + +/** + * wait_event_lock_irq_cmd - sleep until a condition gets true. The + * condition is checked under the lock. This + * is expected to be called with the lock + * taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before cmd + * and schedule() and reacquired afterwards. + * @cmd: a command which is invoked outside the critical section before + * sleep + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before invoking the cmd and going to sleep and is reacquired + * afterwards. + */ +#define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +/** + * wait_event_lock_irq - sleep until a condition gets true. The + * condition is checked under the lock. This + * is expected to be called with the lock + * taken. 
+ * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before schedule() + * and reacquired afterwards. + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before going to sleep and is reacquired afterwards. + */ +#define wait_event_lock_irq(wq, condition, lock) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, ); \ +} while (0) + + +#define __wait_event_interruptible_lock_irq(wq, condition, \ + lock, ret, cmd) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + finish_wait(&wq, &__wait); \ +} while (0) + +/** + * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. + * The condition is checked under the lock. This is expected to + * be called with the lock taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before cmd and + * schedule() and reacquired afterwards. + * @cmd: a command which is invoked outside the critical section before + * sleep + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or a signal is received. The @condition is + * checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before invoking the cmd and going to sleep and is reacquired + * afterwards. + * + * The macro will return -ERESTARTSYS if it was interrupted by a signal + * and 0 if @condition evaluated to true. + */ +#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ +({ \ + int __ret = 0; \ + \ + if (!(condition)) \ + __wait_event_interruptible_lock_irq(wq, condition, \ + lock, __ret, cmd); \ + __ret; \ +}) + +/** + * wait_event_interruptible_lock_irq - sleep until a condition gets true. + * The condition is checked under the lock. This is expected + * to be called with the lock taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before schedule() + * and reacquired afterwards. + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or signal is received. The @condition is + * checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before going to sleep and is reacquired afterwards. + * + * The macro will return -ERESTARTSYS if it was interrupted by a signal + * and 0 if @condition evaluated to true. 
+ */ +#define wait_event_interruptible_lock_irq(wq, condition, lock) \ +({ \ + int __ret = 0; \ + \ + if (!(condition)) \ + __wait_event_interruptible_lock_irq(wq, condition, \ + lock, __ret, ); \ + __ret; \ +}) + + /* * These are the old interfaces to sleep waiting for an event. * They are racy. DO NOT use them, use the wait_event* interfaces above. diff --git a/trunk/include/net/inet_connection_sock.h b/trunk/include/net/inet_connection_sock.h index ba1d3615acbb..183292722f6e 100644 --- a/trunk/include/net/inet_connection_sock.h +++ b/trunk/include/net/inet_connection_sock.h @@ -318,6 +318,7 @@ extern void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long max_rto); extern void inet_csk_destroy_sock(struct sock *sk); +extern void inet_csk_prepare_forced_close(struct sock *sk); /* * LISTEN is a special case for poll.. diff --git a/trunk/include/net/ndisc.h b/trunk/include/net/ndisc.h index 7af1ea893038..23b3a7c58783 100644 --- a/trunk/include/net/ndisc.h +++ b/trunk/include/net/ndisc.h @@ -78,6 +78,13 @@ struct ra_msg { __be32 retrans_timer; }; +struct rd_msg { + struct icmp6hdr icmph; + struct in6_addr target; + struct in6_addr dest; + __u8 opt[0]; +}; + struct nd_opt_hdr { __u8 nd_opt_type; __u8 nd_opt_len; diff --git a/trunk/include/net/net_namespace.h b/trunk/include/net/net_namespace.h index c5a43f56b796..de644bcd8613 100644 --- a/trunk/include/net/net_namespace.h +++ b/trunk/include/net/net_namespace.h @@ -56,6 +56,8 @@ struct net { struct user_namespace *user_ns; /* Owning user namespace */ + unsigned int proc_inum; + struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; diff --git a/trunk/include/trace/events/btrfs.h b/trunk/include/trace/events/btrfs.h index 54fab041b22a..ea546a4e9609 100644 --- a/trunk/include/trace/events/btrfs.h +++ b/trunk/include/trace/events/btrfs.h @@ -45,7 +45,8 @@ struct extent_buffer; #define show_root_type(obj) \ obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ - (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" + (obj >= BTRFS_ROOT_TREE_OBJECTID && \ + obj <= BTRFS_CSUM_TREE_OBJECTID)) ? 
__show_root_type(obj) : "-" #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ diff --git a/trunk/include/trace/events/gfpflags.h b/trunk/include/trace/events/gfpflags.h index d6fd8e5b14b7..1eddbf1557f2 100644 --- a/trunk/include/trace/events/gfpflags.h +++ b/trunk/include/trace/events/gfpflags.h @@ -34,6 +34,7 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/trunk/include/uapi/asm-generic/unistd.h b/trunk/include/uapi/asm-generic/unistd.h index 6e595ba545f4..2c531f478410 100644 --- a/trunk/include/uapi/asm-generic/unistd.h +++ b/trunk/include/uapi/asm-generic/unistd.h @@ -690,9 +690,11 @@ __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ compat_sys_process_vm_writev) #define __NR_kcmp 272 __SYSCALL(__NR_kcmp, sys_kcmp) +#define __NR_finit_module 273 +__SYSCALL(__NR_finit_module, sys_finit_module) #undef __NR_syscalls -#define __NR_syscalls 273 +#define __NR_syscalls 274 /* * All syscalls below here should go away really, diff --git a/trunk/include/uapi/linux/if_bridge.h b/trunk/include/uapi/linux/if_bridge.h index afbb18a0227c..5db297514aec 100644 --- a/trunk/include/uapi/linux/if_bridge.h +++ b/trunk/include/uapi/linux/if_bridge.h @@ -163,6 +163,9 @@ struct br_port_msg { struct br_mdb_entry { __u32 ifindex; +#define MDB_TEMPORARY 0 +#define MDB_PERMANENT 1 + __u8 state; struct { union { __be32 ip4; diff --git a/trunk/include/uapi/linux/module.h b/trunk/include/uapi/linux/module.h new file mode 100644 index 000000000000..38da4258b12f --- /dev/null +++ b/trunk/include/uapi/linux/module.h @@ -0,0 +1,8 @@ +#ifndef _UAPI_LINUX_MODULE_H +#define _UAPI_LINUX_MODULE_H + +/* Flags for sys_finit_module: */ +#define MODULE_INIT_IGNORE_MODVERSIONS 1 +#define MODULE_INIT_IGNORE_VERMAGIC 2 + +#endif /* _UAPI_LINUX_MODULE_H */ diff --git a/trunk/include/uapi/linux/ptrace.h b/trunk/include/uapi/linux/ptrace.h index 1ef6c056a9e4..022ab186a812 100644 --- a/trunk/include/uapi/linux/ptrace.h +++ b/trunk/include/uapi/linux/ptrace.h @@ -73,7 +73,10 @@ #define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT) #define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP) -#define PTRACE_O_MASK 0x000000ff +/* eventless options */ +#define PTRACE_O_EXITKILL (1 << 20) + +#define PTRACE_O_MASK (0x000000ff | PTRACE_O_EXITKILL) #include diff --git a/trunk/include/uapi/linux/swab.h b/trunk/include/uapi/linux/swab.h index e811474724c2..0e011eb91b5d 100644 --- a/trunk/include/uapi/linux/swab.h +++ b/trunk/include/uapi/linux/swab.h @@ -45,7 +45,9 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val) { -#ifdef __arch_swab16 +#ifdef __HAVE_BUILTIN_BSWAP16__ + return __builtin_bswap16(val); +#elif defined (__arch_swab16) return __arch_swab16(val); #else return ___constant_swab16(val); @@ -54,7 +56,9 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val) static inline __attribute_const__ __u32 __fswab32(__u32 val) { -#ifdef __arch_swab32 +#ifdef __HAVE_BUILTIN_BSWAP32__ + return __builtin_bswap32(val); +#elif defined(__arch_swab32) return __arch_swab32(val); #else return ___constant_swab32(val); @@ -63,7 +67,9 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val) static inline __attribute_const__ __u64 __fswab64(__u64 val) { -#ifdef __arch_swab64 +#ifdef 
__HAVE_BUILTIN_BSWAP64__ + return __builtin_bswap64(val); +#elif defined (__arch_swab64) return __arch_swab64(val); #elif defined(__SWAB_64_THRU_32__) __u32 h = val >> 32; diff --git a/trunk/include/xen/interface/event_channel.h b/trunk/include/xen/interface/event_channel.h index 2090881c3650..f4942921e202 100644 --- a/trunk/include/xen/interface/event_channel.h +++ b/trunk/include/xen/interface/event_channel.h @@ -177,6 +177,19 @@ struct evtchn_unmask { evtchn_port_t port; }; +/* + * EVTCHNOP_reset: Close all event channels associated with specified domain. + * NOTES: + * 1. may be specified as DOMID_SELF. + * 2. Only a sufficiently-privileged domain may specify other than DOMID_SELF. + */ +#define EVTCHNOP_reset 10 +struct evtchn_reset { + /* IN parameters. */ + domid_t dom; +}; +typedef struct evtchn_reset evtchn_reset_t; + struct evtchn_op { uint32_t cmd; /* EVTCHNOP_* */ union { diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 1a207efca591..7d30240e5bfe 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -882,7 +882,7 @@ config MEMCG_SWAP_ENABLED config MEMCG_KMEM bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" depends on MEMCG && EXPERIMENTAL - default n + depends on SLUB || SLAB help The Kernel Memory extension for Memory Resource Controller can limit the amount of memory used by kernel objects in the system. Those are @@ -1069,11 +1069,9 @@ config UIDGID_CONVERTED # Filesystems depends on 9P_FS = n depends on AFS_FS = n - depends on AUTOFS4_FS = n depends on CEPH_FS = n depends on CIFS = n depends on CODA_FS = n - depends on FUSE_FS = n depends on GFS2_FS = n depends on NCP_FS = n depends on NFSD = n diff --git a/trunk/init/do_mounts.c b/trunk/init/do_mounts.c index f8a66424360d..1d1b6348f903 100644 --- a/trunk/init/do_mounts.c +++ b/trunk/init/do_mounts.c @@ -69,23 +69,28 @@ __setup("ro", readonly); __setup("rw", readwrite); #ifdef CONFIG_BLOCK +struct uuidcmp { + const char *uuid; + int len; +}; + /** * match_dev_by_uuid - callback for finding a partition using its uuid * @dev: device passed in by the caller - * @data: opaque pointer to a 36 byte char array with a UUID + * @data: opaque pointer to the desired struct uuidcmp to match * * Returns 1 if the device matches, and 0 otherwise. */ static int match_dev_by_uuid(struct device *dev, void *data) { - u8 *uuid = data; + struct uuidcmp *cmp = data; struct hd_struct *part = dev_to_part(dev); if (!part->info) goto no_match; - if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) - goto no_match; + if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + goto no_match; return 1; no_match: @@ -95,7 +100,7 @@ static int match_dev_by_uuid(struct device *dev, void *data) /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID - * @uuid: min 36 byte char array containing a hex ascii UUID + * @uuid: char array containing ascii UUID * * The function will return the first partition which contains a matching * UUID value in its partition_meta_info struct. This does not search @@ -106,38 +111,41 @@ static int match_dev_by_uuid(struct device *dev, void *data) * * Returns the matching dev_t on success or 0 on failure. 
*/ -static dev_t devt_from_partuuid(char *uuid_str) +static dev_t devt_from_partuuid(const char *uuid_str) { dev_t res = 0; + struct uuidcmp cmp; struct device *dev = NULL; - u8 uuid[16]; struct gendisk *disk; struct hd_struct *part; int offset = 0; + bool clear_root_wait = false; + char *slash; - if (strlen(uuid_str) < 36) - goto done; + cmp.uuid = uuid_str; + slash = strchr(uuid_str, '/'); /* Check for optional partition number offset attributes. */ - if (uuid_str[36]) { + if (slash) { char c = 0; /* Explicitly fail on poor PARTUUID syntax. */ - if (sscanf(&uuid_str[36], - "/PARTNROFF=%d%c", &offset, &c) != 1) { - printk(KERN_ERR "VFS: PARTUUID= is invalid.\n" - "Expected PARTUUID=[/PARTNROFF=%%d]\n"); - if (root_wait) - printk(KERN_ERR - "Disabling rootwait; root= is invalid.\n"); - root_wait = 0; + if (sscanf(slash + 1, + "PARTNROFF=%d%c", &offset, &c) != 1) { + clear_root_wait = true; goto done; } + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); } - /* Pack the requested UUID in the expected format. */ - part_pack_uuid(uuid_str, uuid); + if (!cmp.len) { + clear_root_wait = true; + goto done; + } - dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); + dev = class_find_device(&block_class, NULL, &cmp, + &match_dev_by_uuid); if (!dev) goto done; @@ -158,6 +166,13 @@ static dev_t devt_from_partuuid(char *uuid_str) no_offset: put_device(dev); done: + if (clear_root_wait) { + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + } return res; } #endif @@ -174,6 +189,10 @@ static dev_t devt_from_partuuid(char *uuid_str) * used when disk name of partitioned disk ends on a digit. * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. * 7) PARTUUID=/PARTNROFF= to select a partition in relation to * a partition with a known unique id. 
* diff --git a/trunk/init/main.c b/trunk/init/main.c index 63ae904a99a8..baf1f0f5c461 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -812,7 +812,6 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); - current->signal->flags |= SIGNAL_UNKILLABLE; flush_delayed_fput(); if (ramdisk_execute_command) { diff --git a/trunk/init/version.c b/trunk/init/version.c index 86fe0ccb997a..58170f18912d 100644 --- a/trunk/init/version.c +++ b/trunk/init/version.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a @@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, + .proc_inum = PROC_UTS_INIT_INO, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/trunk/ipc/msgutil.c b/trunk/ipc/msgutil.c index 26143d377c95..6471f1bdae96 100644 --- a/trunk/ipc/msgutil.c +++ b/trunk/ipc/msgutil.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "util.h" @@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .count = ATOMIC_INIT(1), .user_ns = &init_user_ns, + .proc_inum = PROC_IPC_INIT_INO, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/trunk/ipc/namespace.c b/trunk/ipc/namespace.c index f362298c5ce4..7c1fa451b0b0 100644 --- a/trunk/ipc/namespace.c +++ b/trunk/ipc/namespace.c @@ -16,7 +16,7 @@ #include "util.h" -static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, +static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; @@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, if (ns == NULL) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + atomic_set(&ns->count, 1); err = mq_init_ns(ns); if (err) { + proc_free_inum(ns->proc_inum); kfree(ns); return ERR_PTR(err); } @@ -46,19 +53,17 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); return ns; } struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct ipc_namespace *ns) { - struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; - if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); - return create_ipc_ns(tsk, ns); + return create_ipc_ns(user_ns, ns); } /* @@ -113,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -161,8 +167,13 @@ static void ipcns_put(void *ns) return put_ipc_ns(ns); } -static int ipcns_install(struct nsproxy *nsproxy, void *ns) +static int ipcns_install(struct nsproxy *nsproxy, void *new) { + struct ipc_namespace *ns = new; + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Ditch state from the old ipc namespace */ exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); @@ -170,10 +181,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int ipcns_inum(void *vp) +{ + struct ipc_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, + .inum = ipcns_inum, }; diff 
--git a/trunk/kernel/Makefile b/trunk/kernel/Makefile index ac0d533eb7de..6c072b6da239 100644 --- a/trunk/kernel/Makefile +++ b/trunk/kernel/Makefile @@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -137,10 +137,14 @@ ifeq ($(CONFIG_MODULE_SIG),y) # # Pull the signing certificate and any extra certificates into the kernel # + +quiet_cmd_touch = TOUCH $@ + cmd_touch = touch $@ + extra_certificates: - touch $@ + $(call cmd,touch) -kernel/modsign_pubkey.o: signing_key.x509 extra_certificates +kernel/modsign_certificate.o: signing_key.x509 extra_certificates ############################################################################### # diff --git a/trunk/kernel/cgroup.c b/trunk/kernel/cgroup.c index f34c41bfaa37..4855892798fd 100644 --- a/trunk/kernel/cgroup.c +++ b/trunk/kernel/cgroup.c @@ -1333,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* See feature-removal-schedule.txt */ if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); @@ -3409,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/trunk/kernel/compat.c b/trunk/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/trunk/kernel/compat.c +++ b/trunk/kernel/compat.c @@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } +#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL +asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval) +{ + struct timespec t; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); + set_fs(old_fs); + if (put_compat_timespec(&t, interval)) + return -EFAULT; + return ret; +} +#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ + /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/trunk/kernel/cred.c b/trunk/kernel/cred.c index 8888afb846e9..e0573a43c7df 100644 --- a/trunk/kernel/cred.c +++ b/trunk/kernel/cred.c @@ -372,6 +372,31 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) return ret; } +static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) +{ + const struct user_namespace *set_ns = set->user_ns; + const struct user_namespace *subset_ns = subset->user_ns; + + /* If the two credentials are in the same user namespace see if + * the capabilities of subset are a subset of set. 
+ */ + if (set_ns == subset_ns) + return cap_issubset(subset->cap_permitted, set->cap_permitted); + + /* The credentials are in a different user namespaces + * therefore one is a subset of the other only if a set is an + * ancestor of subset and set->euid is owner of subset or one + * of subsets ancestors. + */ + for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { + if ((set_ns == subset_ns->parent) && + uid_eq(subset_ns->owner, set->euid)) + return true; + } + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -410,7 +435,7 @@ int commit_creds(struct cred *new) !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || - !cap_issubset(new->cap_permitted, old->cap_permitted)) { + !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/trunk/kernel/events/core.c b/trunk/kernel/events/core.c index f9ff5493171d..301079d06f24 100644 --- a/trunk/kernel/events/core.c +++ b/trunk/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index 50d2e93c36ea..b4df21937216 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). - */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 115d6c2e4cca..85f6d536608d 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? 
page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + if (is_child_reaper(pid)) { + ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); @@ -1473,8 +1473,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) @@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; } /* @@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. 
*/ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/trunk/kernel/irq/manage.c b/trunk/kernel/irq/manage.c index 35c70c9e24d8..e49a288fa479 100644 --- a/trunk/kernel/irq/manage.c +++ b/trunk/kernel/irq/manage.c @@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); diff --git a/trunk/kernel/modsign_certificate.S b/trunk/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/trunk/kernel/modsign_certificate.S @@ -0,0 +1,19 @@ +/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ +#ifndef SYMBOL_PREFIX +#define ASM_SYMBOL(sym) sym +#else +#define PASTE2(x,y) x##y +#define PASTE(x,y) PASTE2(x,y) +#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) +#endif + +#define GLOBAL(name) \ + .globl ASM_SYMBOL(name); \ + ASM_SYMBOL(name): + + .section ".init.data","aw" + +GLOBAL(modsign_certificate_list) + .incbin "signing_key.x509" + .incbin "extra_certificates" +GLOBAL(modsign_certificate_list_end) diff --git a/trunk/kernel/modsign_pubkey.c b/trunk/kernel/modsign_pubkey.c index 767e559dfb10..045504fffbb2 100644 --- a/trunk/kernel/modsign_pubkey.c +++ b/trunk/kernel/modsign_pubkey.c @@ -20,12 +20,6 @@ struct key *modsign_keyring; extern __initdata const u8 modsign_certificate_list[]; extern __initdata const u8 modsign_certificate_list_end[]; -asm(".section .init.data,\"aw\"\n" - SYMBOL_PREFIX "modsign_certificate_list:\n" - ".incbin \"signing_key.x509\"\n" - ".incbin \"extra_certificates\"\n" - SYMBOL_PREFIX "modsign_certificate_list_end:" - ); /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice diff --git a/trunk/kernel/module.c b/trunk/kernel/module.c index 6e48c3a43599..250092c1d57d 100644 --- a/trunk/kernel/module.c +++ b/trunk/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +61,7 @@ #include #include #include +#include #include "module-internal.h" #define CREATE_TRACE_POINTS @@ -372,9 +375,6 @@ 
static bool check_symbol(const struct symsearch *syms, printk(KERN_WARNING "Symbol %s is being used " "by a non-GPL module, which will not " "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); } } @@ -2282,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size; + unsigned int i, nsrc, ndst, strtab_size = 0; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2293,9 +2293,6 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* strtab always starts with a nul, so offset 0 is the empty string. */ - strtab_size = 1; - /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || @@ -2337,7 +2334,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *s++ = 0; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { @@ -2378,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return size == 0 ? NULL : vmalloc_exec(size); + return vmalloc_exec(size); } static void *module_alloc_update_bounds(unsigned long size) @@ -2425,18 +2421,17 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, - const void *mod, unsigned long *_len) +static int module_sig_check(struct load_info *info) { int err = -ENOKEY; - unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - unsigned long len = *_len; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const void *mod = info->hdr; - if (len > markerlen && - memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + if (info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ - *_len -= markerlen; - err = mod_verify_sig(mod, _len); + info->len -= markerlen; + err = mod_verify_sig(mod, &info->len); } if (!err) { @@ -2454,59 +2449,107 @@ static int module_sig_check(struct load_info *info, return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, - void *mod, unsigned long *len) +static int module_sig_check(struct load_info *info) { return 0; } #endif /* !CONFIG_MODULE_SIG */ -/* Sets info->hdr, info->len and info->sig_ok. */ -static int copy_and_check(struct load_info *info, - const void __user *umod, unsigned long len, - const char __user *uargs) +/* Sanity checks against invalid binaries, wrong arch, weird elf version. 
*/ +static int elf_header_check(struct load_info *info) +{ + if (info->len < sizeof(*(info->hdr))) + return -ENOEXEC; + + if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 + || info->hdr->e_type != ET_REL + || !elf_check_arch(info->hdr) + || info->hdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (info->hdr->e_shoff >= info->len + || (info->hdr->e_shnum * sizeof(Elf_Shdr) > + info->len - info->hdr->e_shoff)) + return -ENOEXEC; + + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) { int err; - Elf_Ehdr *hdr; - if (len < sizeof(*hdr)) + info->len = len; + if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; + err = security_kernel_module_from_file(NULL); + if (err) + return err; + /* Suck in entire file: we'll want most of it. */ - if ((hdr = vmalloc(len)) == NULL) + info->hdr = vmalloc(info->len); + if (!info->hdr) return -ENOMEM; - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; + if (copy_from_user(info->hdr, umod, info->len) != 0) { + vfree(info->hdr); + return -EFAULT; } - err = module_sig_check(info, hdr, &len); + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_fd(int fd, struct load_info *info) +{ + struct file *file; + int err; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + file = fget(fd); + if (!file) + return -ENOEXEC; + + err = security_kernel_module_from_file(file); if (err) - goto free_hdr; + goto out; - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; - } + err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); + if (err) + goto out; - if (hdr->e_shoff >= len || - hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { - err = -ENOEXEC; - goto free_hdr; + if (stat.size > INT_MAX) { + err = -EFBIG; + goto out; + } + info->hdr = vmalloc(stat.size); + if (!info->hdr) { + err = -ENOMEM; + goto out; } - info->hdr = hdr; - info->len = len; - return 0; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(info->hdr); + err = bytes; + goto out; + } + if (bytes == 0) + break; + pos += bytes; + } + info->len = pos; -free_hdr: - vfree(hdr); +out: + fput(file); return err; } @@ -2515,7 +2558,7 @@ static void free_copy(struct load_info *info) vfree(info->hdr); } -static int rewrite_section_headers(struct load_info *info) +static int rewrite_section_headers(struct load_info *info, int flags) { unsigned int i; @@ -2543,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info) } /* Track but don't keep modinfo and version sections. */ - info->index.vers = find_sec(info, "__versions"); + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.info = find_sec(info, ".modinfo"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -2558,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info) * Return the temporary module pointer (we'll replace it with the final * one when we move the module sections around). 
*/ -static struct module *setup_load_info(struct load_info *info) +static struct module *setup_load_info(struct load_info *info, int flags) { unsigned int i; int err; @@ -2569,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info) info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info); + err = rewrite_section_headers(info, flags); if (err) return ERR_PTR(err); @@ -2607,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info) return mod; } -static int check_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo(struct module *mod, struct load_info *info, int flags) { const char *modmagic = get_modinfo(info, "vermagic"); int err; + if (flags & MODULE_INIT_IGNORE_VERMAGIC) + modmagic = NULL; + /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); @@ -2741,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info) memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc_update_bounds(mod->init_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ - kmemleak_ignore(ptr); - if (!ptr && mod->init_size) { - module_free(mod, mod->module_core); - return -ENOMEM; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + if (mod->init_size) { + ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr) { + module_free(mod, mod->module_core); + return -ENOMEM; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + } else + mod->module_init = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -2847,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } -static struct module *layout_and_allocate(struct load_info *info) +static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; Elf_Shdr *pcpusec; int err; - mod = setup_load_info(info); + mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; - err = check_modinfo(mod, info); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -2945,33 +2997,124 @@ static bool finished_loading(const char *name) return ret; } +/* Call module constructors. 
*/ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* This is where the real work happens */ +static int do_init_module(struct module *mod) +{ + int ret = 0; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; + } + if (ret > 0) { + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } + + /* Now it's a first class citizen! */ + mod->state = MODULE_STATE_LIVE; + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + + mutex_lock(&module_mutex); + /* Drop initial reference. */ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_ro_size = 0; + mod->init_text_size = 0; + mutex_unlock(&module_mutex); + wake_up_all(&module_wq); + + return 0; +} + +static int may_init_module(void) +{ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +static int load_module(struct load_info *info, const char __user *uargs, + int flags) { - struct load_info info = { NULL, }; struct module *mod, *old; long err; - pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); + err = module_sig_check(info); + if (err) + goto free_copy; - /* Copy in the blobs from userspace, check they are vaguely sane. */ - err = copy_and_check(&info, umod, len, uargs); + err = elf_header_check(info); if (err) - return ERR_PTR(err); + goto free_copy; /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(&info); + mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { err = PTR_ERR(mod); goto free_copy; } #ifdef CONFIG_MODULE_SIG - mod->sig_ok = info.sig_ok; + mod->sig_ok = info->sig_ok; if (!mod->sig_ok) add_taint_module(mod, TAINT_FORCED_MODULE); #endif @@ -2983,25 +3126,25 @@ static struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. 
*/ - find_module_sections(mod, &info); + find_module_sections(mod, info); err = check_module_license_and_versions(mod); if (err) goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, &info); + setup_modinfo(mod, info); /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, &info); + err = simplify_symbols(mod, info); if (err < 0) goto free_modinfo; - err = apply_relocations(mod, &info); + err = apply_relocations(mod, info); if (err < 0) goto free_modinfo; - err = post_relocation(mod, &info); + err = post_relocation(mod, info); if (err < 0) goto free_modinfo; @@ -3041,14 +3184,14 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - dynamic_debug_setup(info.debug, info.num_debug); + dynamic_debug_setup(info->debug, info->num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) goto ddebug; - module_bug_finalize(info.hdr, info.sechdrs, mod); + module_bug_finalize(info->hdr, info->sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -3059,16 +3202,17 @@ static struct module *load_module(void __user *umod, goto unlink; /* Link in to syfs. */ - err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); + err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto unlink; /* Get rid of temporary copy. */ - free_copy(&info); + free_copy(info); /* Done! */ trace_module_load(mod); - return mod; + + return do_init_module(mod); unlink: mutex_lock(&module_mutex); @@ -3077,7 +3221,7 @@ static struct module *load_module(void __user *umod, module_bug_cleanup(mod); wake_up_all(&module_wq); ddebug: - dynamic_debug_remove(info.debug); + dynamic_debug_remove(info->debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); @@ -3089,106 +3233,52 @@ static struct module *load_module(void __user *umod, free_unload: module_unload_free(mod); free_module: - module_deallocate(mod, &info); + module_deallocate(mod, info); free_copy: - free_copy(&info); - return ERR_PTR(err); -} - -/* Call module constructors. 
*/ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif + free_copy(info); + return err; } -/* This is where the real work happens */ SYSCALL_DEFINE3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs) { - struct module *mod; - int ret = 0; + int err; + struct load_info info = { }; - /* Must have permission */ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; + err = may_init_module(); + if (err) + return err; - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) - return PTR_ERR(mod); + pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); + err = copy_module_from_user(umod, len, &info); + if (err) + return err; - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); + return load_module(&info, uargs, 0); +} - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + int err; + struct load_info info = { }; - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } + err = may_init_module(); + if (err) + return err; - /* Now it's a first class citizen! */ - mod->state = MODULE_STATE_LIVE; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC)) + return -EINVAL; - mutex_lock(&module_mutex); - /* Drop initial reference. */ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; -#endif - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); - wake_up_all(&module_wq); + err = copy_module_from_fd(fd, &info); + if (err) + return err; - return 0; + return load_module(&info, uargs, flags); } static inline int within(unsigned long addr, void *start, unsigned long size) diff --git a/trunk/kernel/nsproxy.c b/trunk/kernel/nsproxy.c index 7e1c3de1ce45..78e2ecb20165 100644 --- a/trunk/kernel/nsproxy.c +++ b/trunk/kernel/nsproxy.c @@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. 
*/ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) + struct task_struct *tsk, struct user_namespace *user_ns, + struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk); + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk); + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -122,6 +123,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { err = -EPERM; goto out; } @@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, tsk->fs); + new_ns = create_new_namespaces(flags, tsk, + task_cred_xxx(tsk, user_ns), tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { + struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + new_fs ? 
new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; @@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); @@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/trunk/kernel/pid.c b/trunk/kernel/pid.c index fd996c1ed9f8..36aa02ff17d6 100644 --- a/trunk/kernel/pid.c +++ b/trunk/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). @@ -269,8 +257,24 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; + hlist_del_rcu(&upid->pid_chain); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). 
+ */ + wake_up_process(ns->child_reaper); + break; + case 0: + ns->nr_hashed = -1; + schedule_work(&ns->proc_work); + break; + } + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -292,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -302,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + if (ns->nr_hashed < 0) + goto out_unlock; + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: return pid; +out_unlock: + spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -344,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -428,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -483,7 +498,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -494,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; @@ -569,6 +584,7 @@ void __init pidmap_init(void) /* Reserve PID 0. 
We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/trunk/kernel/pid_namespace.c b/trunk/kernel/pid_namespace.c index 7b07cc0dfb75..fdbd0cdf271a 100644 --- a/trunk/kernel/pid_namespace.c +++ b/trunk/kernel/pid_namespace.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -71,10 +72,17 @@ static struct kmem_cache *create_pid_cachep(int nr_ids) return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, + struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; @@ -99,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); + ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) + if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); + return create_pid_namespace(user_ns, old_ns); } static void free_pid_ns(struct kref *kref) @@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). 
*/ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(¤t->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !capable(CAP_SYS_ADMIN)) + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = ¤t->nsproxy->pid_ns->last_pid; + tmp.data = &pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } @@ -299,6 +304,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } +static void *pidns_get(struct task_struct *task) +{ + struct pid_namespace *ns; + + rcu_read_lock(); + ns = get_pid_ns(task_active_pid_ns(task)); + rcu_read_unlock(); + + return ns; +} + +static void pidns_put(void *ns) +{ + put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *ancestor, *new = ns; + + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Only allow entering the current active pid namespace + * or a child of the current active pid namespace. + * + * This is required for fork to return a usable pid value and + * this maintains the property that processes and their + * children can not escape their current pid namespace. 
+ */ + if (new->level < active->level) + return -EINVAL; + + ancestor = new; + while (ancestor->level > active->level) + ancestor = ancestor->parent; + if (ancestor != active) + return -EINVAL; + + put_pid_ns(nsproxy->pid_ns); + nsproxy->pid_ns = get_pid_ns(new); + return 0; +} + +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + +const struct proc_ns_operations pidns_operations = { + .name = "pid", + .type = CLONE_NEWPID, + .get = pidns_get, + .put = pidns_put, + .install = pidns_install, + .inum = pidns_inum, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/trunk/kernel/posix-cpu-timers.c b/trunk/kernel/posix-cpu-timers.c index d73840271dce..a278cad1d5d6 100644 --- a/trunk/kernel/posix-cpu-timers.c +++ b/trunk/kernel/posix-cpu-timers.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -470,6 +471,8 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers, tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); diff --git a/trunk/kernel/printk.c b/trunk/kernel/printk.c index 22e070f3470a..19c0d7bcf24a 100644 --- a/trunk/kernel/printk.c +++ b/trunk/kernel/printk.c @@ -747,6 +747,21 @@ void __init setup_log_buf(int early) free, (free * 100) / __LOG_BUF_LEN); } +static bool __read_mostly ignore_loglevel; + +static int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 0; +} + +early_param("ignore_loglevel", ignore_loglevel_setup); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -770,13 +785,15 @@ static int __init boot_delay_setup(char *str) } __setup("boot_delay=", boot_delay_setup); -static void boot_delay_msec(void) +static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + || (level >= console_loglevel && !ignore_loglevel)) { return; + } k = (unsigned long long)loops_per_msec * boot_delay; @@ -795,7 +812,7 @@ static void boot_delay_msec(void) } } #else -static inline void boot_delay_msec(void) +static inline void boot_delay_msec(int level) { } #endif @@ -1238,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. 
@@ -1498,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; - boot_delay_msec(); + boot_delay_msec(level); printk_delay(); /* This stops the holder of console_sem just where we want him */ diff --git a/trunk/kernel/ptrace.c b/trunk/kernel/ptrace.c index 1f5e55dda955..1599157336a6 100644 --- a/trunk/kernel/ptrace.c +++ b/trunk/kernel/ptrace.c @@ -215,8 +215,12 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + rcu_read_lock(); + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; + rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); @@ -457,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer) return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (unlikely(p->ptrace & PT_EXITKILL)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, p); + if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } diff --git a/trunk/kernel/res_counter.c b/trunk/kernel/res_counter.c index 3920d593e63c..ff55247e7049 100644 --- a/trunk/kernel/res_counter.c +++ b/trunk/kernel/res_counter.c @@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) val = counter->usage; counter->usage -= val; + return counter->usage; } -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; + u64 ret = 0; local_irq_save(flags); for (c = counter; c != top; c = c->parent) { + u64 r; spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); + r = res_counter_uncharge_locked(c, val); + if (c == counter) + ret = r; spin_unlock(&c->lock); } local_irq_restore(flags); + return ret; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) { - res_counter_uncharge_until(counter, NULL, val); + return res_counter_uncharge_until(counter, NULL, val); } static inline unsigned long long * diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index c1fb82104bfb..257002c13bb0 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if 
(retval) diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched/fair.c index 4603d6cb9e25..5eea8707234a 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched/fair.c @@ -793,8 +793,11 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; static void task_numa_placement(struct task_struct *p) { - int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + int seq; + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; diff --git a/trunk/kernel/signal.c b/trunk/kernel/signal.c index a49c7f36ceb3..580a91e63471 100644 --- a/trunk/kernel/signal.c +++ b/trunk/kernel/signal.c @@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/trunk/kernel/sys_ni.c b/trunk/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/trunk/kernel/sys_ni.c +++ b/trunk/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); +cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); cond_syscall(sys_bind); diff --git a/trunk/kernel/sysctl_binary.c b/trunk/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/trunk/kernel/sysctl_binary.c +++ b/trunk/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) diff --git a/trunk/kernel/trace/ftrace.c b/trunk/kernel/trace/ftrace.c index afd092de45b7..3ffe4c5ad3f3 100644 --- a/trunk/kernel/trace/ftrace.c +++ b/trunk/kernel/trace/ftrace.c @@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 1; diff --git a/trunk/kernel/trace/trace.c b/trunk/kernel/trace/trace.c index 61e081b4ba11..e5125677efa0 100644 --- a/trunk/kernel/trace/trace.c +++ b/trunk/kernel/trace/trace.c @@ -3034,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) tr->data[cpu]->entries = val; } +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_array *tr, + struct trace_array *size_tr, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu]->entries, cpu); + if (ret < 0) + break; + tr->data[cpu]->entries = size_tr->data[cpu]->entries; + } + } else { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu_id]->entries, cpu_id); + if (ret == 0) + tr->data[cpu_id]->entries = + size_tr->data[cpu_id]->entries; + } + + return ret; +} + static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ 
-3058,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r = 0; - - if (cpu == RING_BUFFER_ALL_CPUS) { - int i; - for_each_tracing_cpu(i) { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[i]->entries, - i); - if (r < 0) - break; - } - } else { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[cpu]->entries, - cpu); - } - + int r = resize_buffer_duplicate_size(&global_trace, + &global_trace, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3212,17 +3222,11 @@ static int tracing_set_tracer(const char *buf) topts = create_trace_option_files(t); if (t->use_max_tr) { - int cpu; /* we need to make per cpu buffer sizes equivalent */ - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(max_tr.buffer, - global_trace.data[cpu]->entries, - cpu); - if (ret < 0) - goto out; - max_tr.data[cpu]->entries = - global_trace.data[cpu]->entries; - } + ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; } if (t->init) { @@ -4271,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -ENOMEM; if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); ret = -EINVAL; goto out; } if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); if (len < PAGE_SIZE) { ret = -EINVAL; goto out; diff --git a/trunk/kernel/trace/trace_stack.c b/trunk/kernel/trace/trace_stack.c index 0c1b165778e5..42ca822fc701 100644 --- a/trunk/kernel/trace/trace_stack.c +++ b/trunk/kernel/trace/trace_stack.c @@ -33,7 +33,6 @@ static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); static DEFINE_MUTEX(stack_sysctl_mutex); @@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, { int cpu; - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - preempt_disable_notrace(); cpu = raw_smp_processor_id(); diff --git a/trunk/kernel/trace/trace_uprobe.c b/trunk/kernel/trace/trace_uprobe.c index 9614db8b0f8c..c86e6d4f67fb 100644 --- a/trunk/kernel/trace/trace_uprobe.c +++ b/trunk/kernel/trace/trace_uprobe.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) /* setup a probe */ if (!event) { - char *tail = strrchr(filename, '/'); + char *tail; char *ptr; - ptr = kstrdup((tail ? 
tail + 1 : filename), GFP_KERNEL); - if (!ptr) { + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { ret = -ENOMEM; goto fail_address_parse; } - tail = ptr; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 750acffbe9ec..33acb5e53a5f 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/trunk/kernel/user_namespace.c b/trunk/kernel/user_namespace.c index 456a6b9fba34..2b042c42fbc4 100644 --- a/trunk/kernel/user_namespace.c +++ b/trunk/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) +{ + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + cred->securebits = SECUREBITS_DEFAULT; + cred->cap_inheritable = CAP_EMPTY_SET; + cred->cap_permitted = CAP_FULL_SET; + cred->cap_effective = CAP_FULL_SET; + cred->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(cred->request_key_auth); + cred->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + cred->user_ns = user_ns; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -39,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -52,38 +72,45 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); + /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - new->securebits = SECUREBITS_DEFAULT; - new->cap_inheritable = CAP_EMPTY_SET; - new->cap_permitted = CAP_FULL_SET; - new->cap_effective = CAP_FULL_SET; - new->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* Leave the new->user_ns reference with the new user namespace. */ - /* Leave the reference to our user_ns with the new cred. 
*/ - new->user_ns = ns; + set_cred_user_ns(new, ns); return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } @@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow mapping to your own filesystem ids */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); + if (uid_eq(uid, current_fsuid())) + return true; + } + else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (gid_eq(gid, current_fsgid())) + return true; + } + } + /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; @@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } +static void *userns_get(struct task_struct *task) +{ + struct user_namespace *user_ns; + + rcu_read_lock(); + user_ns = get_user_ns(__task_cred(task)->user_ns); + rcu_read_unlock(); + + return user_ns; +} + +static void userns_put(void *ns) +{ + put_user_ns(ns); +} + +static int userns_install(struct nsproxy *nsproxy, void *ns) +{ + struct user_namespace *user_ns = ns; + struct cred *cred; + + /* Don't allow gaining capabilities by reentering + * the same user namespace. 
+ */ + if (user_ns == current_user_ns()) + return -EINVAL; + + /* Threaded processes may not enter a different user namespace */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + put_user_ns(cred->user_ns); + set_cred_user_ns(cred, get_user_ns(user_ns)); + + return commit_creds(cred); +} + +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + +const struct proc_ns_operations userns_operations = { + .name = "user", + .type = CLONE_NEWUSER, + .get = userns_get, + .put = userns_put, + .install = userns_install, + .inum = userns_inum, +}; + static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); diff --git a/trunk/kernel/utsname.c b/trunk/kernel/utsname.c index 679d97a5d3fd..08b197e8c485 100644 --- a/trunk/kernel/utsname.c +++ b/trunk/kernel/utsname.c @@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; } @@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, * versa. 
*/ struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(tsk, old_ns); + new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; @@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -102,19 +109,32 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *ns) +static int utsns_install(struct nsproxy *nsproxy, void *new) { + struct uts_namespace *ns = new; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/trunk/kernel/watchdog.c b/trunk/kernel/watchdog.c index c8c21be11ab4..75a2ab3d0b02 100644 --- a/trunk/kernel/watchdog.c +++ b/trunk/kernel/watchdog.c @@ -31,6 +31,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_disabled; +static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static u64 get_sample_period(void) +static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -125,7 +126,7 @@ static u64 get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) wake_up_process(__this_cpu_read(softlockup_watchdog)); /* .. 
and repeat */ - hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); if (touch_ts == 0) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { @@ -343,6 +344,10 @@ static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + if (!watchdog_enabled) { kthread_park(current); return; @@ -351,12 +356,8 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* kick off the timer for the hardlockup detector */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); /* initialize timestamp */ @@ -368,9 +369,6 @@ static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - if (!watchdog_enabled) - return; - watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ @@ -386,7 +384,7 @@ static int watchdog_should_run(unsigned int cpu) /* * The watchdog thread function - touches the timestamp. * - * It only runs once every get_sample_period() seconds (4 seconds by + * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). @@ -519,6 +517,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, if (ret || !write) return ret; + set_sample_period(); if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else @@ -540,6 +539,7 @@ static struct smp_hotplug_thread watchdog_threads = { void __init lockup_detector_init(void) { + set_sample_period(); if (smpboot_register_percpu_thread(&watchdog_threads)) { pr_err("Failed to create watchdog threads, disabled\n"); watchdog_disabled = -ENODEV; diff --git a/trunk/lib/Kconfig b/trunk/lib/Kconfig index 4b31a46fb307..75cdb77fa49d 100644 --- a/trunk/lib/Kconfig +++ b/trunk/lib/Kconfig @@ -42,6 +42,9 @@ config GENERIC_IO config STMP_DEVICE bool +config PERCPU_RWSEM + boolean + config CRC_CCITT tristate "CRC-CCITT functions" help diff --git a/trunk/lib/Kconfig.debug b/trunk/lib/Kconfig.debug index e458782f3c52..3a353091a903 100644 --- a/trunk/lib/Kconfig.debug +++ b/trunk/lib/Kconfig.debug @@ -1192,14 +1192,14 @@ config MEMORY_NOTIFIER_ERROR_INJECT If unsure, say N. -config PSERIES_RECONFIG_NOTIFIER_ERROR_INJECT - tristate "pSeries reconfig notifier error injection module" - depends on PPC_PSERIES && NOTIFIER_ERROR_INJECTION +config OF_RECONFIG_NOTIFIER_ERROR_INJECT + tristate "OF reconfig notifier error injection module" + depends on OF_DYNAMIC && NOTIFIER_ERROR_INJECTION help This option provides the ability to inject artificial errors to - pSeries reconfig notifier chain callbacks. It is controlled + OF reconfig notifier chain callbacks. It is controlled through debugfs interface under - /sys/kernel/debug/notifier-error-inject/pSeries-reconfig/ + /sys/kernel/debug/notifier-error-inject/OF-reconfig/ If the notifier call chain should be failed with some events notified, write the error code to "actions//error". 
diff --git a/trunk/lib/Makefile b/trunk/lib/Makefile index e2152fa7ff4d..02ed6c04cd7d 100644 --- a/trunk/lib/Makefile +++ b/trunk/lib/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o @@ -94,8 +95,8 @@ obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o -obj-$(CONFIG_PSERIES_RECONFIG_NOTIFIER_ERROR_INJECT) += \ - pSeries-reconfig-notifier-error-inject.o +obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ + of-reconfig-notifier-error-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o diff --git a/trunk/lib/asn1_decoder.c b/trunk/lib/asn1_decoder.c index 5293d2433029..11b9b01fda6b 100644 --- a/trunk/lib/asn1_decoder.c +++ b/trunk/lib/asn1_decoder.c @@ -81,7 +81,7 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen goto next_tag; } - if (unlikely((tag & 0x1f) == 0x1f)) { + if (unlikely((tag & 0x1f) == ASN1_LONG_TAG)) { do { if (unlikely(datalen - dp < 2)) goto data_overrun_error; @@ -96,7 +96,7 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen goto next_tag; } - if (unlikely(len == 0x80)) { + if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ if (unlikely((tag & ASN1_CONS_BIT) == ASN1_PRIM << 5)) goto indefinite_len_primitive; @@ -222,7 +222,7 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, if (unlikely(dp >= datalen - 1)) goto data_overrun_error; tag = data[dp++]; - if (unlikely((tag & 0x1f) == 0x1f)) + if (unlikely((tag & 0x1f) == ASN1_LONG_TAG)) goto long_tag_not_supported; if (op & ASN1_OP_MATCH__ANY) { @@ -254,7 +254,7 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, len = data[dp++]; if (len > 0x7f) { - if (unlikely(len == 0x80)) { + if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ if (unlikely(!(tag & ASN1_CONS_BIT))) goto indefinite_len_primitive; diff --git a/trunk/lib/dynamic_debug.c b/trunk/lib/dynamic_debug.c index e7f7d993357a..1db1fc660538 100644 --- a/trunk/lib/dynamic_debug.c +++ b/trunk/lib/dynamic_debug.c @@ -62,13 +62,6 @@ static LIST_HEAD(ddebug_tables); static int verbose = 0; module_param(verbose, int, 0644); -/* Return the last part of a pathname */ -static inline const char *basename(const char *path) -{ - const char *tail = strrchr(path, '/'); - return tail ? 
tail+1 : path; -} - /* Return the path relative to source root */ static inline const char *trim_prefix(const char *path) { @@ -154,7 +147,7 @@ static int ddebug_change(const struct ddebug_query *query, /* match against the source filename */ if (query->filename && strcmp(query->filename, dp->filename) && - strcmp(query->filename, basename(dp->filename)) && + strcmp(query->filename, kbasename(dp->filename)) && strcmp(query->filename, trim_prefix(dp->filename))) continue; diff --git a/trunk/lib/interval_tree_test_main.c b/trunk/lib/interval_tree_test_main.c index b25903987f7a..245900b98c8e 100644 --- a/trunk/lib/interval_tree_test_main.c +++ b/trunk/lib/interval_tree_test_main.c @@ -30,7 +30,8 @@ static void init(void) { int i; for (i = 0; i < NODES; i++) { - u32 a = prandom32(&rnd), b = prandom32(&rnd); + u32 a = prandom_u32_state(&rnd); + u32 b = prandom_u32_state(&rnd); if (a <= b) { nodes[i].start = a; nodes[i].last = b; @@ -40,7 +41,7 @@ static void init(void) } } for (i = 0; i < SEARCHES; i++) - queries[i] = prandom32(&rnd); + queries[i] = prandom_u32_state(&rnd); } static int interval_tree_test_init(void) @@ -51,7 +52,7 @@ static int interval_tree_test_init(void) printk(KERN_ALERT "interval tree insert/remove"); - prandom32_seed(&rnd, 3141592653589793238ULL); + prandom_seed_state(&rnd, 3141592653589793238ULL); init(); time1 = get_cycles(); diff --git a/trunk/lib/kstrtox.c b/trunk/lib/kstrtox.c index c3615eab0cc3..f78ae0c0c4e2 100644 --- a/trunk/lib/kstrtox.c +++ b/trunk/lib/kstrtox.c @@ -104,6 +104,22 @@ static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) return 0; } +/** + * kstrtoull - convert a string to an unsigned long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtoull(const char *s, unsigned int base, unsigned long long *res) { if (s[0] == '+') @@ -112,6 +128,22 @@ int kstrtoull(const char *s, unsigned int base, unsigned long long *res) } EXPORT_SYMBOL(kstrtoull); +/** + * kstrtoll - convert a string to a long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. 
+ * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtoll(const char *s, unsigned int base, long long *res) { unsigned long long tmp; @@ -168,6 +200,22 @@ int _kstrtol(const char *s, unsigned int base, long *res) } EXPORT_SYMBOL(_kstrtol); +/** + * kstrtouint - convert a string to an unsigned int + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtouint(const char *s, unsigned int base, unsigned int *res) { unsigned long long tmp; @@ -183,6 +231,22 @@ int kstrtouint(const char *s, unsigned int base, unsigned int *res) } EXPORT_SYMBOL(kstrtouint); +/** + * kstrtoint - convert a string to an int + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ int kstrtoint(const char *s, unsigned int base, int *res) { long long tmp; diff --git a/trunk/lib/lru_cache.c b/trunk/lib/lru_cache.c index a07e7268d7ed..d71d89498943 100644 --- a/trunk/lib/lru_cache.c +++ b/trunk/lib/lru_cache.c @@ -44,8 +44,8 @@ MODULE_LICENSE("GPL"); } while (0) #define RETURN(x...) do { \ - clear_bit(__LC_PARANOIA, &lc->flags); \ - smp_mb__after_clear_bit(); return x ; } while (0) + clear_bit_unlock(__LC_PARANOIA, &lc->flags); \ + return x ; } while (0) /* BUG() if e is not one of the elements tracked by lc */ #define PARANOIA_LC_ELEMENT(lc, e) do { \ @@ -55,9 +55,40 @@ MODULE_LICENSE("GPL"); BUG_ON(i >= lc_->nr_elements); \ BUG_ON(lc_->lc_element[i] != e_); } while (0) + +/* We need to atomically + * - try to grab the lock (set LC_LOCKED) + * - only if there is no pending transaction + * (neither LC_DIRTY nor LC_STARVING is set) + * Because of PARANOIA_ENTRY() above abusing lc->flags as well, + * it is not sufficient to just say + * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED); + */ +int lc_try_lock(struct lru_cache *lc) +{ + unsigned long val; + do { + val = cmpxchg(&lc->flags, 0, LC_LOCKED); + } while (unlikely (val == LC_PARANOIA)); + /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. 
*/ + return 0 == val; +#if 0 + /* Alternative approach, spin in case someone enters or leaves a + * PARANOIA_ENTRY()/RETURN() section. */ + unsigned long old, new, val; + do { + old = lc->flags & LC_PARANOIA; + new = old | LC_LOCKED; + val = cmpxchg(&lc->flags, old, new); + } while (unlikely (val == (old ^ LC_PARANOIA))); + return old == val; +#endif +} + /** * lc_create - prepares to track objects in an active set * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details + * @max_pending_changes: maximum changes to accumulate until a transaction is required * @e_count: number of elements allowed to be active simultaneously * @e_size: size of the tracked objects * @e_off: offset to the &struct lc_element member in a tracked object @@ -66,6 +97,7 @@ MODULE_LICENSE("GPL"); * or NULL on (allocation) failure. */ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off) { struct hlist_head *slot = NULL; @@ -98,12 +130,13 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); + INIT_LIST_HEAD(&lc->to_be_changed); lc->name = name; lc->element_size = e_size; lc->element_off = e_off; lc->nr_elements = e_count; - lc->new_number = LC_FREE; + lc->max_pending_changes = max_pending_changes; lc->lc_cache = cache; lc->lc_element = element; lc->lc_slot = slot; @@ -117,6 +150,7 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, e = p + e_off; e->lc_index = i; e->lc_number = LC_FREE; + e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); element[i] = e; } @@ -175,15 +209,15 @@ void lc_reset(struct lru_cache *lc) INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); + INIT_LIST_HEAD(&lc->to_be_changed); lc->used = 0; lc->hits = 0; lc->misses = 0; lc->starving = 0; - lc->dirty = 0; + lc->locked = 0; lc->changed = 0; + lc->pending_changes = 0; lc->flags = 0; - lc->changing_element = NULL; - lc->new_number = LC_FREE; memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); for (i = 0; i < lc->nr_elements; i++) { @@ -194,6 +228,7 @@ void lc_reset(struct lru_cache *lc) /* re-init it */ e->lc_index = i; e->lc_number = LC_FREE; + e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); } } @@ -208,14 +243,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) /* NOTE: * total calls to lc_get are * (starving + hits + misses) - * misses include "dirty" count (update from an other thread in + * misses include "locked" count (update from an other thread in * progress) and "changed", when this in fact lead to an successful * update of the cache. 
*/ return seq_printf(seq, "\t%s: used:%u/%u " - "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", + "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, - lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); + lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) @@ -224,16 +259,8 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) } -/** - * lc_find - find element by label, if present in the hash table - * @lc: The lru_cache object - * @enr: element number - * - * Returns the pointer to an element, if the element with the requested - * "label" or element number is present in the hash table, - * or NULL if not found. Does not change the refcnt. - */ -struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) +static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, + bool include_changing) { struct hlist_node *n; struct lc_element *e; @@ -241,29 +268,48 @@ struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) BUG_ON(!lc); BUG_ON(!lc->nr_elements); hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { - if (e->lc_number == enr) + /* "about to be changed" elements, pending transaction commit, + * are hashed by their "new number". "Normal" elements have + * lc_number == lc_new_number. */ + if (e->lc_new_number != enr) + continue; + if (e->lc_new_number == e->lc_number || include_changing) return e; + break; } return NULL; } -/* returned element will be "recycled" immediately */ -static struct lc_element *lc_evict(struct lru_cache *lc) +/** + * lc_find - find element by label, if present in the hash table + * @lc: The lru_cache object + * @enr: element number + * + * Returns the pointer to an element, if the element with the requested + * "label" or element number is present in the hash table, + * or NULL if not found. Does not change the refcnt. + * Ignores elements that are "about to be used", i.e. not yet in the active + * set, but still pending transaction commit. + */ +struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) { - struct list_head *n; - struct lc_element *e; - - if (list_empty(&lc->lru)) - return NULL; - - n = lc->lru.prev; - e = list_entry(n, struct lc_element, list); - - PARANOIA_LC_ELEMENT(lc, e); + return __lc_find(lc, enr, 0); +} - list_del(&e->list); - hlist_del(&e->colision); - return e; +/** + * lc_is_used - find element by label + * @lc: The lru_cache object + * @enr: element number + * + * Returns true, if the element with the requested "label" or element number is + * present in the hash table, and is used (refcnt > 0). + * Also finds elements that are not _currently_ used but only "about to be + * used", i.e. on the "to_be_changed" list, pending transaction commit. 
+ */ +bool lc_is_used(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e = __lc_find(lc, enr, 1); + return e && e->refcnt; } /** @@ -280,22 +326,34 @@ void lc_del(struct lru_cache *lc, struct lc_element *e) PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt); - e->lc_number = LC_FREE; + e->lc_number = e->lc_new_number = LC_FREE; hlist_del_init(&e->colision); list_move(&e->list, &lc->free); RETURN(); } -static struct lc_element *lc_get_unused_element(struct lru_cache *lc) +static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number) { struct list_head *n; + struct lc_element *e; + + if (!list_empty(&lc->free)) + n = lc->free.next; + else if (!list_empty(&lc->lru)) + n = lc->lru.prev; + else + return NULL; + + e = list_entry(n, struct lc_element, list); + PARANOIA_LC_ELEMENT(lc, e); - if (list_empty(&lc->free)) - return lc_evict(lc); + e->lc_new_number = new_number; + if (!hlist_unhashed(&e->colision)) + __hlist_del(&e->colision); + hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); + list_move(&e->list, &lc->to_be_changed); - n = lc->free.next; - list_del(n); - return list_entry(n, struct lc_element, list); + return e; } static int lc_unused_element_available(struct lru_cache *lc) @@ -308,45 +366,7 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } - -/** - * lc_get - get element by label, maybe change the active set - * @lc: the lru cache to operate on - * @enr: the label to look up - * - * Finds an element in the cache, increases its usage count, - * "touches" and returns it. - * - * In case the requested number is not present, it needs to be added to the - * cache. Therefore it is possible that an other element becomes evicted from - * the cache. In either case, the user is notified so he is able to e.g. keep - * a persistent log of the cache changes, and therefore the objects in use. - * - * Return values: - * NULL - * The cache was marked %LC_STARVING, - * or the requested label was not in the active set - * and a changing transaction is still pending (@lc was marked %LC_DIRTY). - * Or no unused or free element could be recycled (@lc will be marked as - * %LC_STARVING, blocking further lc_get() operations). - * - * pointer to the element with the REQUESTED element number. - * In this case, it can be used right away - * - * pointer to an UNUSED element with some different element number, - * where that different number may also be %LC_FREE. - * - * In this case, the cache is marked %LC_DIRTY (blocking further changes), - * and the returned element pointer is removed from the lru list and - * hash collision chains. The user now should do whatever housekeeping - * is necessary. - * Then he must call lc_changed(lc,element_pointer), to finish - * the change. - * - * NOTE: The user needs to check the lc_number on EACH use, so he recognizes - * any cache set change. - */ -struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) { struct lc_element *e; @@ -356,8 +376,12 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) RETURN(NULL); } - e = lc_find(lc, enr); - if (e) { + e = __lc_find(lc, enr, 1); + /* if lc_new_number != lc_number, + * this enr is currently being pulled in already, + * and will be available once the pending transaction + * has been committed. 
*/ + if (e && e->lc_new_number == e->lc_number) { ++lc->hits; if (e->refcnt++ == 0) lc->used++; @@ -366,6 +390,26 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) } ++lc->misses; + if (!may_change) + RETURN(NULL); + + /* It has been found above, but on the "to_be_changed" list, not yet + * committed. Don't pull it in twice, wait for the transaction, then + * try again */ + if (e) + RETURN(NULL); + + /* To avoid races with lc_try_lock(), first, mark us dirty + * (using test_and_set_bit, as it implies memory barriers), ... */ + test_and_set_bit(__LC_DIRTY, &lc->flags); + + /* ... only then check if it is locked anyways. If lc_unlock clears + * the dirty bit again, that's not a problem, we will come here again. + */ + if (test_bit(__LC_LOCKED, &lc->flags)) { + ++lc->locked; + RETURN(NULL); + } /* In case there is nothing available and we can not kick out * the LRU element, we have to wait ... @@ -375,71 +419,109 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) RETURN(NULL); } - /* it was not present in the active set. - * we are going to recycle an unused (or even "free") element. - * user may need to commit a transaction to record that change. - * we serialize on flags & TF_DIRTY */ - if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { - ++lc->dirty; + /* It was not present in the active set. We are going to recycle an + * unused (or even "free") element, but we won't accumulate more than + * max_pending_changes changes. */ + if (lc->pending_changes >= lc->max_pending_changes) RETURN(NULL); - } - e = lc_get_unused_element(lc); + e = lc_prepare_for_change(lc, enr); BUG_ON(!e); clear_bit(__LC_STARVING, &lc->flags); BUG_ON(++e->refcnt != 1); lc->used++; - - lc->changing_element = e; - lc->new_number = enr; + lc->pending_changes++; RETURN(e); } -/* similar to lc_get, - * but only gets a new reference on an existing element. - * you either get the requested element, or NULL. - * will be consolidated into one function. +/** + * lc_get - get element by label, maybe change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes evicted from + * the cache. In either case, the user is notified so he is able to e.g. keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * and a changing transaction is still pending (@lc was marked %LC_DIRTY). + * Or no unused or free element could be recycled (@lc will be marked as + * %LC_STARVING, blocking further lc_get() operations). + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number, + * where that different number may also be %LC_FREE. + * + * In this case, the cache is marked %LC_DIRTY, + * so lc_try_lock() will no longer succeed. + * The returned element pointer is moved to the "to_be_changed" list, + * and registered with the new element number on the hash collision chains, + * so it is possible to pick it up from lc_is_used(). + * Up to "max_pending_changes" (see lc_create()) can be accumulated. 
+ * The user now should do whatever housekeeping is necessary, + * typically serialize on lc_try_lock_for_transaction(), then call + * lc_committed(lc) and lc_unlock(), to finish the change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. */ -struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - struct lc_element *e; - - PARANOIA_ENTRY(); - if (lc->flags & LC_STARVING) { - ++lc->starving; - RETURN(NULL); - } + return __lc_get(lc, enr, 1); +} - e = lc_find(lc, enr); - if (e) { - ++lc->hits; - if (e->refcnt++ == 0) - lc->used++; - list_move(&e->list, &lc->in_use); /* Not evictable... */ - } - RETURN(e); +/** + * lc_try_get - get element by label, if present; do not change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + */ +struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, 0); } /** - * lc_changed - tell @lc that the change has been recorded + * lc_committed - tell @lc that pending changes have been recorded * @lc: the lru cache to operate on - * @e: the element pending label change + * + * User is expected to serialize on explicit lc_try_lock_for_transaction() + * before the transaction is started, and later needs to lc_unlock() explicitly + * as well. */ -void lc_changed(struct lru_cache *lc, struct lc_element *e) +void lc_committed(struct lru_cache *lc) { + struct lc_element *e, *tmp; + PARANOIA_ENTRY(); - BUG_ON(e != lc->changing_element); - PARANOIA_LC_ELEMENT(lc, e); - ++lc->changed; - e->lc_number = lc->new_number; - list_add(&e->list, &lc->in_use); - hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); - lc->changing_element = NULL; - lc->new_number = LC_FREE; - clear_bit(__LC_DIRTY, &lc->flags); - smp_mb__after_clear_bit(); + list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) { + /* count number of changes, not number of transactions */ + ++lc->changed; + e->lc_number = e->lc_new_number; + list_move(&e->list, &lc->in_use); + } + lc->pending_changes = 0; RETURN(); } @@ -458,13 +540,12 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) PARANOIA_ENTRY(); PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt == 0); - BUG_ON(e == lc->changing_element); + BUG_ON(e->lc_number != e->lc_new_number); if (--e->refcnt == 0) { /* move it to the front of LRU. */ list_move(&e->list, &lc->lru); lc->used--; - clear_bit(__LC_STARVING, &lc->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(__LC_STARVING, &lc->flags); } RETURN(e->refcnt); } @@ -504,16 +585,24 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) void lc_set(struct lru_cache *lc, unsigned int enr, int index) { struct lc_element *e; + struct list_head *lh; if (index < 0 || index >= lc->nr_elements) return; e = lc_element_by_index(lc, index); - e->lc_number = enr; + BUG_ON(e->lc_number != e->lc_new_number); + BUG_ON(e->refcnt != 0); + e->lc_number = e->lc_new_number = enr; hlist_del_init(&e->colision); - hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); - list_move(&e->list, e->refcnt ? 
&lc->in_use : &lc->lru); + if (enr == LC_FREE) + lh = &lc->free; + else { + hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); + lh = &lc->lru; + } + list_move(&e->list, lh); } /** @@ -553,8 +642,10 @@ EXPORT_SYMBOL(lc_try_get); EXPORT_SYMBOL(lc_find); EXPORT_SYMBOL(lc_get); EXPORT_SYMBOL(lc_put); -EXPORT_SYMBOL(lc_changed); +EXPORT_SYMBOL(lc_committed); EXPORT_SYMBOL(lc_element_by_index); EXPORT_SYMBOL(lc_index_of); EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); +EXPORT_SYMBOL(lc_try_lock); +EXPORT_SYMBOL(lc_is_used); diff --git a/trunk/lib/pSeries-reconfig-notifier-error-inject.c b/trunk/lib/of-reconfig-notifier-error-inject.c similarity index 51% rename from trunk/lib/pSeries-reconfig-notifier-error-inject.c rename to trunk/lib/of-reconfig-notifier-error-inject.c index 7f7c98dcd5c4..8dc79861758a 100644 --- a/trunk/lib/pSeries-reconfig-notifier-error-inject.c +++ b/trunk/lib/of-reconfig-notifier-error-inject.c @@ -1,20 +1,20 @@ #include #include - -#include +#include #include "notifier-error-inject.h" static int priority; module_param(priority, int, 0); -MODULE_PARM_DESC(priority, "specify pSeries reconfig notifier priority"); +MODULE_PARM_DESC(priority, "specify OF reconfig notifier priority"); static struct notifier_err_inject reconfig_err_inject = { .actions = { - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_RECONFIG_ADD) }, - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_RECONFIG_REMOVE) }, - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_DRCONF_MEM_ADD) }, - { NOTIFIER_ERR_INJECT_ACTION(PSERIES_DRCONF_MEM_REMOVE) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_ATTACH_NODE) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_DETACH_NODE) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_ADD_PROPERTY) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_REMOVE_PROPERTY) }, + { NOTIFIER_ERR_INJECT_ACTION(OF_RECONFIG_UPDATE_PROPERTY) }, {} } }; @@ -25,12 +25,12 @@ static int err_inject_init(void) { int err; - dir = notifier_err_inject_init("pSeries-reconfig", + dir = notifier_err_inject_init("OF-reconfig", notifier_err_inject_dir, &reconfig_err_inject, priority); if (IS_ERR(dir)) return PTR_ERR(dir); - err = pSeries_reconfig_notifier_register(&reconfig_err_inject.nb); + err = of_reconfig_notifier_register(&reconfig_err_inject.nb); if (err) debugfs_remove_recursive(dir); @@ -39,13 +39,13 @@ static int err_inject_init(void) static void err_inject_exit(void) { - pSeries_reconfig_notifier_unregister(&reconfig_err_inject.nb); + of_reconfig_notifier_unregister(&reconfig_err_inject.nb); debugfs_remove_recursive(dir); } module_init(err_inject_init); module_exit(err_inject_exit); -MODULE_DESCRIPTION("pSeries reconfig notifier error injection module"); +MODULE_DESCRIPTION("OF reconfig notifier error injection module"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Akinobu Mita "); diff --git a/trunk/lib/percpu-rwsem.c b/trunk/lib/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/trunk/lib/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, + const char *name, struct lock_class_key *rwsem_key) +{ + brw->fast_read_ctr = alloc_percpu(int); + if (unlikely(!brw->fast_read_ctr)) + return -ENOMEM; + + /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + __init_rwsem(&brw->rw_sem, name, rwsem_key); + atomic_set(&brw->write_ctr, 0); + atomic_set(&brw->slow_read_ctr, 0); + init_waitqueue_head(&brw->write_waitq); + return 0; +} + +void 
percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ + free_percpu(brw->fast_read_ctr); + brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + * R_W: down_write() comes after up_read(), the writer should see all + * changes done by the reader + * or + * W_R: down_read() comes after up_write(), the reader should see all + * changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ + bool success = false; + + preempt_disable(); + if (likely(!atomic_read(&brw->write_ctr))) { + __this_cpu_add(*brw->fast_read_ctr, val); + success = true; + } + preempt_enable(); + + return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). + */ +void percpu_down_read(struct percpu_rw_semaphore *brw) +{ + might_sleep(); + if (likely(update_fast_ctr(brw, +1))) { + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + return; + } + + down_read(&brw->rw_sem); + atomic_inc(&brw->slow_read_ctr); + /* avoid up_read()->rwsem_release() */ + __up_read(&brw->rw_sem); +} + +void percpu_up_read(struct percpu_rw_semaphore *brw) +{ + rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); + + if (likely(update_fast_ctr(brw, -1))) + return; + + /* false-positive is possible but harmless */ + if (atomic_dec_and_test(&brw->slow_read_ctr)) + wake_up_all(&brw->write_waitq); +} + +static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +{ + unsigned int sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + sum += per_cpu(*brw->fast_read_ctr, cpu); + per_cpu(*brw->fast_read_ctr, cpu) = 0; + } + + return sum; +} + +/* + * A writer increments ->write_ctr to force the readers to switch to the + * slow mode, note the atomic_read() check in update_fast_ctr(). + * + * After that the readers can only inc/dec the slow ->slow_read_ctr counter, + * ->fast_read_ctr is stable. Once the writer moves its sum into the slow + * counter it represents the number of active readers. + * + * Finally the writer takes ->rw_sem for writing and blocks the new readers, + * then waits until the slow counter becomes zero. + */ +void percpu_down_write(struct percpu_rw_semaphore *brw) +{ + /* tell update_fast_ctr() there is a pending writer */ + atomic_inc(&brw->write_ctr); + /* + * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read + * so that update_fast_ctr() can't succeed. + * + * 2. Ensures we see the result of every previous this_cpu_add() in + * update_fast_ctr(). + * + * 3. 
Ensures that if any reader has exited its critical section via + * fast-path, it executes a full memory barrier before we return. + * See R_W case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + + /* exclude other writers, and block the new readers completely */ + down_write(&brw->rw_sem); + + /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ + atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + + /* wait for all readers to complete their percpu_up_read() */ + wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); +} + +void percpu_up_write(struct percpu_rw_semaphore *brw) +{ + /* release the lock, but the readers can't use the fast-path */ + up_write(&brw->rw_sem); + /* + * Insert the barrier before the next fast-path in down_read, + * see W_R case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + /* the last writer unblocks update_fast_ctr() */ + atomic_dec(&brw->write_ctr); +} diff --git a/trunk/lib/raid6/Makefile b/trunk/lib/raid6/Makefile index de06dfe165b8..9f7c184725d7 100644 --- a/trunk/lib/raid6/Makefile +++ b/trunk/lib/raid6/Makefile @@ -1,8 +1,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o -raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ - int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ - altivec8.o mmx.o sse1.o sse2.o +raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ + int8.o int16.o int32.o + +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o + hostprogs-y += mktables quiet_cmd_unroll = UNROLL $@ diff --git a/trunk/lib/raid6/algos.c b/trunk/lib/raid6/algos.c index 589f5f50ad2e..6d7316fe9f30 100644 --- a/trunk/lib/raid6/algos.c +++ b/trunk/lib/raid6/algos.c @@ -45,11 +45,20 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_sse1x2, &raid6_sse2x1, &raid6_sse2x2, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, &raid6_sse2x2, &raid6_sse2x4, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, + &raid6_avx2x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, @@ -72,6 +81,9 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) +#ifdef CONFIG_AS_AVX2 + &raid6_recov_avx2, +#endif &raid6_recov_ssse3, #endif &raid6_recov_intx1, diff --git a/trunk/lib/raid6/altivec.uc b/trunk/lib/raid6/altivec.uc index b71012b756f4..7cc12b532e95 100644 --- a/trunk/lib/raid6/altivec.uc +++ b/trunk/lib/raid6/altivec.uc @@ -24,13 +24,10 @@ #include -#ifdef CONFIG_ALTIVEC - #include #ifdef __KERNEL__ # include # include -#endif /* * This is the C data type to use. We use a vector of diff --git a/trunk/lib/raid6/avx2.c b/trunk/lib/raid6/avx2.c new file mode 100644 index 000000000000..bc3b1dd436eb --- /dev/null +++ b/trunk/lib/raid6/avx2.c @@ -0,0 +1,251 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 2012 Intel Corporation + * Author: Yuanhan Liu + * + * Based on sse2.c: Copyright 2002 H. 
Peter Anvin - All Rights Reserved + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * AVX2 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX2 + +#include +#include "x86.h" + +static const struct raid6_avx2_constants { + u64 x1d[4]; +} raid6_avx2_constants __aligned(32) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); +} + +/* + * Plain AVX2 implementation + */ +static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */ + + for (d = 0; d < bytes; d += 32) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x1 = { + raid6_avx21_gen_syndrome, + raid6_have_avx2, + "avx2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX2 implementation + */ +static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("vmovdqa 
%0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ + asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ + asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x2 = { + raid6_avx22_gen_syndrome, + raid6_have_avx2, + "avx2x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */ + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */ + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */ + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */ + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */ + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */ + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */ + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */ + + for (d = 0; d < bytes; d += 128) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); + 
asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x4 = { + raid6_avx24_gen_syndrome, + raid6_have_avx2, + "avx2x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX2 */ diff --git a/trunk/lib/raid6/mmx.c b/trunk/lib/raid6/mmx.c index 279347f23094..590c71c9e200 100644 --- a/trunk/lib/raid6/mmx.c +++ b/trunk/lib/raid6/mmx.c @@ -16,7 +16,7 @@ * MMX implementation of RAID-6 syndrome functions */ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32 #include #include "x86.h" diff --git a/trunk/lib/raid6/recov_avx2.c b/trunk/lib/raid6/recov_avx2.c new file mode 100644 index 000000000000..e1eea433a493 --- /dev/null +++ b/trunk/lib/raid6/recov_avx2.c @@ -0,0 +1,323 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#if CONFIG_AS_AVX2 + +#include +#include "x86.h" + +static int raid6_has_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX); +} + +static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* ymm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0])); + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32])); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32])); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[32] ^ q[32] + * 0 = dp[0] ^ p[0] + * 8 = dp[32] ^ p[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpsraw $4, %ymm9, %ymm12"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm9, %ymm9"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm9, %ymm4, %ymm14"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm12, %ymm5, %ymm15"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm14, %ymm15, %ymm15"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* + * 5 = qx[0] + * 15 = qx[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpsraw $4, %ymm8, %ymm6"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm14"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm14, %ymm4, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm13"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + asm volatile("vpxor %ymm12, %ymm13, %ymm13"); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[32]] + */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + asm volatile("vpxor %ymm15, %ymm13, %ymm13"); + + /* + * 1 = db = DQ + * 13 = db[32] = DQ[32] + */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm13,%0" : 
"=m" (dq[32])); + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vpxor %ymm13, %ymm8, %ymm8"); + + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q)); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p)); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq)); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* 5 = qx */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + + /* 1 = pbmul[px] */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + /* 1 = db = DQ */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[32] ^ dq[32] + */ + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vmovapd %ymm0, %ymm13"); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + asm volatile("vmovapd %ymm1, %ymm14"); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpsraw $4, %ymm8, %ymm12"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm8"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm8, %ymm13, %ymm13"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpshufb %ymm12, %ymm14, %ymm14"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + asm 
volatile("vpxor %ymm13, %ymm14, %ymm14"); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[32] ^ dq[32]] + */ + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + asm volatile("vpxor %ymm14, %ymm12, %ymm12"); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[32] ^ qmul[q[32] ^ dq[32]] + */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx2 = { + .data2 = raid6_2data_recov_avx2, + .datap = raid6_datap_recov_avx2, + .valid = raid6_has_avx2, +#ifdef CONFIG_X86_64 + .name = "avx2x2", +#else + .name = "avx2x1", +#endif + .priority = 2, +}; + +#else +#warning "your version of binutils lacks AVX2 support" +#endif diff --git a/trunk/lib/raid6/recov_ssse3.c b/trunk/lib/raid6/recov_ssse3.c index ecb710c0b4d9..a9168328f03b 100644 --- a/trunk/lib/raid6/recov_ssse3.c +++ b/trunk/lib/raid6/recov_ssse3.c @@ -7,8 +7,6 @@ * of the License. */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #include #include "x86.h" @@ -332,5 +330,3 @@ const struct raid6_recov_calls raid6_recov_ssse3 = { #endif .priority = 1, }; - -#endif diff --git a/trunk/lib/raid6/sse1.c b/trunk/lib/raid6/sse1.c index 10dd91948c07..f76297139445 100644 --- a/trunk/lib/raid6/sse1.c +++ b/trunk/lib/raid6/sse1.c @@ -21,7 +21,7 @@ * worthwhile as a separate implementation. 
*/ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32 #include #include "x86.h" diff --git a/trunk/lib/raid6/sse2.c b/trunk/lib/raid6/sse2.c index bc2d57daa589..85b82c85f28e 100644 --- a/trunk/lib/raid6/sse2.c +++ b/trunk/lib/raid6/sse2.c @@ -17,8 +17,6 @@ * */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #include #include "x86.h" @@ -159,9 +157,7 @@ const struct raid6_calls raid6_sse2x2 = { 1 /* Has cache hints */ }; -#endif - -#if defined(__x86_64__) && !defined(__arch_um__) +#ifdef CONFIG_X86_64 /* * Unrolled-by-4 SSE2 implementation @@ -259,4 +255,4 @@ const struct raid6_calls raid6_sse2x4 = { 1 /* Has cache hints */ }; -#endif +#endif /* CONFIG_X86_64 */ diff --git a/trunk/lib/raid6/test/Makefile b/trunk/lib/raid6/test/Makefile index c76151d94764..087332dbf8aa 100644 --- a/trunk/lib/raid6/test/Makefile +++ b/trunk/lib/raid6/test/Makefile @@ -10,6 +10,31 @@ LD = ld AWK = awk -f AR = ar RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o + +ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) +ifeq ($(ARCH),i386) + CFLAGS += -DCONFIG_X86_32 + IS_X86 = yes +endif +ifeq ($(ARCH),x86_64) + CFLAGS += -DCONFIG_X86_64 + IS_X86 = yes +endif + +ifeq ($(IS_X86),yes) + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX2=1) +else + HAS_ALTIVEC := $(shell echo -e '\#include \nvector int a;' |\ + gcc -c -x c - >&/dev/null && \ + rm ./-.o && echo yes) + ifeq ($(HAS_ALTIVEC),yes) + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o + endif +endif .c.o: $(CC) $(CFLAGS) -c -o $@ $< @@ -22,9 +47,7 @@ RANLIB = ranlib all: raid6.a raid6test -raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ - altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ - tables.o +raid6.a: $(OBJS) rm -f $@ $(AR) cq $@ $^ $(RANLIB) $@ diff --git a/trunk/lib/raid6/x86.h b/trunk/lib/raid6/x86.h index d55d63232c55..b7595484a815 100644 --- a/trunk/lib/raid6/x86.h +++ b/trunk/lib/raid6/x86.h @@ -45,19 +45,23 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ static inline int boot_cpu_has(int flag) { - u32 eax = (flag & 0x20) ? 0x80000001 : 1; - u32 ecx, edx; + u32 eax, ebx, ecx, edx; + + eax = (flag & 0x100) ? 7 : + (flag & 0x20) ? 0x80000001 : 1; + ecx = 0; asm volatile("cpuid" - : "+a" (eax), "=d" (edx), "=c" (ecx) - : : "ebx"); + : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); - return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; + return ((flag & 0x100 ? ebx : + (flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1; } #endif /* ndef __KERNEL__ */ diff --git a/trunk/lib/random32.c b/trunk/lib/random32.c index 938bde5876ac..52280d5526be 100644 --- a/trunk/lib/random32.c +++ b/trunk/lib/random32.c @@ -42,13 +42,13 @@ static DEFINE_PER_CPU(struct rnd_state, net_rand_state); /** - * prandom32 - seeded pseudo-random number generator. + * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. 
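As a worked instance of the user-space boot_cpu_has() emulation above: X86_FEATURE_AVX2 is 9*32 + 5 = 0x125, so the 0x100 bit selects CPUID leaf 7 (with ECX = 0) and the answer is read from EBX bit 5; X86_FEATURE_AVX is 4*32 + 28 = 0x9c, which falls through to leaf 1 with the 0x80 bit steering the test to ECX bit 28; and X86_FEATURE_MMXEXT is 1*32 + 22 = 0x36, whose 0x20 bit selects leaf 0x80000001 and EDX bit 22.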
* * This is used for pseudo-randomness with no outside seeding. - * For more random results, use random32(). + * For more random results, use prandom_u32(). */ -u32 prandom32(struct rnd_state *state) +u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s,a,b,c,d) ((s&c)<>b) @@ -58,32 +58,81 @@ u32 prandom32(struct rnd_state *state) return (state->s1 ^ state->s2 ^ state->s3); } -EXPORT_SYMBOL(prandom32); +EXPORT_SYMBOL(prandom_u32_state); /** - * random32 - pseudo random number generator + * prandom_u32 - pseudo random number generator * * A 32 bit pseudo-random number is generated using a fast * algorithm suitable for simulation. This algorithm is NOT * considered safe for cryptographic use. */ -u32 random32(void) +u32 prandom_u32(void) { unsigned long r; struct rnd_state *state = &get_cpu_var(net_rand_state); - r = prandom32(state); + r = prandom_u32_state(state); put_cpu_var(state); return r; } -EXPORT_SYMBOL(random32); +EXPORT_SYMBOL(prandom_u32); + +/* + * prandom_bytes_state - get the requested number of pseudo-random bytes + * + * @state: pointer to state structure holding seeded state. + * @buf: where to copy the pseudo-random bytes to + * @bytes: the requested number of bytes + * + * This is used for pseudo-randomness with no outside seeding. + * For more random results, use prandom_bytes(). + */ +void prandom_bytes_state(struct rnd_state *state, void *buf, int bytes) +{ + unsigned char *p = buf; + int i; + + for (i = 0; i < round_down(bytes, sizeof(u32)); i += sizeof(u32)) { + u32 random = prandom_u32_state(state); + int j; + + for (j = 0; j < sizeof(u32); j++) { + p[i + j] = random; + random >>= BITS_PER_BYTE; + } + } + if (i < bytes) { + u32 random = prandom_u32_state(state); + + for (; i < bytes; i++) { + p[i] = random; + random >>= BITS_PER_BYTE; + } + } +} +EXPORT_SYMBOL(prandom_bytes_state); + +/** + * prandom_bytes - get the requested number of pseudo-random bytes + * @buf: where to copy the pseudo-random bytes to + * @bytes: the requested number of bytes + */ +void prandom_bytes(void *buf, int bytes) +{ + struct rnd_state *state = &get_cpu_var(net_rand_state); + + prandom_bytes_state(state, buf, bytes); + put_cpu_var(state); +} +EXPORT_SYMBOL(prandom_bytes); /** - * srandom32 - add entropy to pseudo random number generator + * prandom_seed - add entropy to pseudo random number generator * @seed: seed value * - * Add some additional seeding to the random32() pool. + * Add some additional seeding to the prandom pool. */ -void srandom32(u32 entropy) +void prandom_seed(u32 entropy) { int i; /* @@ -95,13 +144,13 @@ void srandom32(u32 entropy) state->s1 = __seed(state->s1 ^ entropy, 1); } } -EXPORT_SYMBOL(srandom32); +EXPORT_SYMBOL(prandom_seed); /* * Generate some initially weak seeding values to allow - * to start the random32() engine. + * to start the prandom_u32() engine. */ -static int __init random32_init(void) +static int __init prandom_init(void) { int i; @@ -114,22 +163,22 @@ static int __init random32_init(void) state->s3 = __seed(LCG(state->s2), 15); /* "warm it up" */ - prandom32(state); - prandom32(state); - prandom32(state); - prandom32(state); - prandom32(state); - prandom32(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); } return 0; } -core_initcall(random32_init); +core_initcall(prandom_init); /* * Generate better values after random number generator * is fully initialized. 
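A small sketch of the renamed interface, assuming a hypothetical in-kernel caller and that the declarations live in <linux/random.h> as before: the seeded-state variants give reproducible sequences (the rbtree test below relies on this), while prandom_u32()/prandom_bytes() draw from the per-cpu state and remain unsuitable for cryptography.

#include <linux/random.h>

static void prandom_example(void)
{
	struct rnd_state rnd;
	u8 cookie[16];
	u32 first;

	prandom_seed_state(&rnd, 12345ULL);
	first = prandom_u32_state(&rnd);	/* deterministic for a fixed seed */

	prandom_bytes(cookie, sizeof(cookie));	/* global per-cpu generator */
	(void)first;
}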
*/ -static int __init random32_reseed(void) +static int __init prandom_reseed(void) { int i; @@ -143,8 +192,8 @@ static int __init random32_reseed(void) state->s3 = __seed(seeds[2], 15); /* mix it in */ - prandom32(state); + prandom_u32_state(state); } return 0; } -late_initcall(random32_reseed); +late_initcall(prandom_reseed); diff --git a/trunk/lib/rbtree_test.c b/trunk/lib/rbtree_test.c index 268b23951fec..af38aedbd874 100644 --- a/trunk/lib/rbtree_test.c +++ b/trunk/lib/rbtree_test.c @@ -96,8 +96,8 @@ static void init(void) { int i; for (i = 0; i < NODES; i++) { - nodes[i].key = prandom32(&rnd); - nodes[i].val = prandom32(&rnd); + nodes[i].key = prandom_u32_state(&rnd); + nodes[i].val = prandom_u32_state(&rnd); } } @@ -118,7 +118,7 @@ static void check(int nr_nodes) { struct rb_node *rb; int count = 0; - int blacks; + int blacks = 0; u32 prev_key = 0; for (rb = rb_first(&root); rb; rb = rb_next(rb)) { @@ -155,7 +155,7 @@ static int rbtree_test_init(void) printk(KERN_ALERT "rbtree testing"); - prandom32_seed(&rnd, 3141592653589793238ULL); + prandom_seed_state(&rnd, 3141592653589793238ULL); init(); time1 = get_cycles(); diff --git a/trunk/lib/scatterlist.c b/trunk/lib/scatterlist.c index 3675452b23ca..7874b01e816e 100644 --- a/trunk/lib/scatterlist.c +++ b/trunk/lib/scatterlist.c @@ -248,7 +248,8 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents, unsigned int left; #ifndef ARCH_HAS_SG_CHAIN - BUG_ON(nents > max_ents); + if (WARN_ON_ONCE(nents > max_ents)) + return -EINVAL; #endif memset(table, 0, sizeof(*table)); diff --git a/trunk/lib/vsprintf.c b/trunk/lib/vsprintf.c index 39c99fea7c03..fab33a9c5318 100644 --- a/trunk/lib/vsprintf.c +++ b/trunk/lib/vsprintf.c @@ -23,12 +23,12 @@ #include #include #include +#include #include #include #include #include /* for PAGE_SIZE */ -#include #include /* for dereference_function_descriptor() */ #include "kstrtox.h" @@ -38,6 +38,8 @@ * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtoull instead. */ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { @@ -61,6 +63,8 @@ EXPORT_SYMBOL(simple_strtoull); * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtoul instead. */ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { @@ -73,6 +77,8 @@ EXPORT_SYMBOL(simple_strtoul); * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtol instead. */ long simple_strtol(const char *cp, char **endp, unsigned int base) { @@ -88,6 +94,8 @@ EXPORT_SYMBOL(simple_strtol); * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use + * + * This function is obsolete. Please use kstrtoll instead. 
*/ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { @@ -1485,7 +1493,10 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) num = va_arg(args, long); break; case FORMAT_TYPE_SIZE_T: - num = va_arg(args, size_t); + if (spec.flags & SIGN) + num = va_arg(args, ssize_t); + else + num = va_arg(args, size_t); break; case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); @@ -2013,7 +2024,11 @@ int vsscanf(const char *buf, const char *fmt, va_list args) char digit; int num = 0; u8 qualifier; - u8 base; + unsigned int base; + union { + long long s; + unsigned long long u; + } val; s16 field_width; bool is_sign; @@ -2053,8 +2068,11 @@ int vsscanf(const char *buf, const char *fmt, va_list args) /* get field width */ field_width = -1; - if (isdigit(*fmt)) + if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); + if (field_width <= 0) + break; + } /* get conversion qualifier */ qualifier = -1; @@ -2154,58 +2172,61 @@ int vsscanf(const char *buf, const char *fmt, va_list args) || (base == 0 && !isdigit(digit))) break; + if (is_sign) + val.s = qualifier != 'L' ? + simple_strtol(str, &next, base) : + simple_strtoll(str, &next, base); + else + val.u = qualifier != 'L' ? + simple_strtoul(str, &next, base) : + simple_strtoull(str, &next, base); + + if (field_width > 0 && next - str > field_width) { + if (base == 0) + _parse_integer_fixup_radix(str, &base); + while (next - str > field_width) { + if (is_sign) + val.s = div_s64(val.s, base); + else + val.u = div_u64(val.u, base); + --next; + } + } + switch (qualifier) { case 'H': /* that's 'hh' in format */ - if (is_sign) { - signed char *s = (signed char *)va_arg(args, signed char *); - *s = (signed char)simple_strtol(str, &next, base); - } else { - unsigned char *s = (unsigned char *)va_arg(args, unsigned char *); - *s = (unsigned char)simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, signed char *) = val.s; + else + *va_arg(args, unsigned char *) = val.u; break; case 'h': - if (is_sign) { - short *s = (short *)va_arg(args, short *); - *s = (short)simple_strtol(str, &next, base); - } else { - unsigned short *s = (unsigned short *)va_arg(args, unsigned short *); - *s = (unsigned short)simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, short *) = val.s; + else + *va_arg(args, unsigned short *) = val.u; break; case 'l': - if (is_sign) { - long *l = (long *)va_arg(args, long *); - *l = simple_strtol(str, &next, base); - } else { - unsigned long *l = (unsigned long *)va_arg(args, unsigned long *); - *l = simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, long *) = val.s; + else + *va_arg(args, unsigned long *) = val.u; break; case 'L': - if (is_sign) { - long long *l = (long long *)va_arg(args, long long *); - *l = simple_strtoll(str, &next, base); - } else { - unsigned long long *l = (unsigned long long *)va_arg(args, unsigned long long *); - *l = simple_strtoull(str, &next, base); - } + if (is_sign) + *va_arg(args, long long *) = val.s; + else + *va_arg(args, unsigned long long *) = val.u; break; case 'Z': case 'z': - { - size_t *s = (size_t *)va_arg(args, size_t *); - *s = (size_t)simple_strtoul(str, &next, base); - } - break; + *va_arg(args, size_t *) = val.u; + break; default: - if (is_sign) { - int *i = (int *)va_arg(args, int *); - *i = (int)simple_strtol(str, &next, base); - } else { - unsigned int *i = (unsigned int *)va_arg(args, unsigned int*); - *i = (unsigned int)simple_strtoul(str, &next, base); - } + if (is_sign) + *va_arg(args, int *) = val.s; + 
else + *va_arg(args, unsigned int *) = val.u; break; } num++; diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index 71259e052ce8..278e3ab1f169 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -149,7 +149,18 @@ config MOVABLE_NODE depends on NO_BOOTMEM depends on X86_64 depends on NUMA - depends on BROKEN + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG diff --git a/trunk/mm/backing-dev.c b/trunk/mm/backing-dev.c index bd6a6cabef71..d3ca2b3ee176 100644 --- a/trunk/mm/backing-dev.c +++ b/trunk/mm/backing-dev.c @@ -10,7 +10,6 @@ #include #include #include -#include #include static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -222,63 +221,12 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio) -static ssize_t cpu_list_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct backing_dev_info *bdi = dev_get_drvdata(dev); - struct bdi_writeback *wb = &bdi->wb; - cpumask_var_t newmask; - ssize_t ret; - struct task_struct *task; - - if (!alloc_cpumask_var(&newmask, GFP_KERNEL)) - return -ENOMEM; - - ret = cpulist_parse(buf, newmask); - if (!ret) { - spin_lock_bh(&bdi->wb_lock); - task = wb->task; - if (task) - get_task_struct(task); - spin_unlock_bh(&bdi->wb_lock); - - mutex_lock(&bdi->flusher_cpumask_lock); - if (task) { - ret = set_cpus_allowed_ptr(task, newmask); - put_task_struct(task); - } - if (ret == 0) { - cpumask_copy(bdi->flusher_cpumask, newmask); - ret = count; - } - mutex_unlock(&bdi->flusher_cpumask_lock); - - } - free_cpumask_var(newmask); - - return ret; -} - -static ssize_t cpu_list_show(struct device *dev, - struct device_attribute *attr, char *page) -{ - struct backing_dev_info *bdi = dev_get_drvdata(dev); - ssize_t ret; - - mutex_lock(&bdi->flusher_cpumask_lock); - ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask); - mutex_unlock(&bdi->flusher_cpumask_lock); - - return ret; -} - #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), __ATTR_RW(min_ratio), __ATTR_RW(max_ratio), - __ATTR_RW(cpu_list), __ATTR_NULL, }; @@ -480,7 +428,6 @@ static int bdi_forker_thread(void *ptr) writeback_inodes_wb(&bdi->wb, 1024, WB_REASON_FORKER_THREAD); } else { - int ret; /* * The spinlock makes sure we do not lose * wake-ups when racing with 'bdi_queue_work()'. 
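To make the new vsscanf() field-width handling above concrete: the conversion is parsed in full first and then divided back down by the base until it fits the width, so a sketch like the following (using the kernel's sscanf() wrapper) splits consecutive digits instead of overrunning the width.

#include <linux/kernel.h>

static void sscanf_width_example(void)
{
	int a = 0, b = 0;

	/* "%2d" parses 1234, then divides by 10 until two digits remain */
	sscanf("1234", "%2d%2d", &a, &b);	/* a == 12, b == 34 */
}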
@@ -490,14 +437,6 @@ static int bdi_forker_thread(void *ptr) spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; spin_unlock_bh(&bdi->wb_lock); - mutex_lock(&bdi->flusher_cpumask_lock); - ret = set_cpus_allowed_ptr(task, - bdi->flusher_cpumask); - mutex_unlock(&bdi->flusher_cpumask_lock); - if (ret) - printk_once("%s: failed to bind flusher" - " thread %s, error %d\n", - __func__, task->comm, ret); wake_up_process(task); } bdi_clear_pending(bdi); @@ -570,17 +509,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, dev_name(dev)); if (IS_ERR(wb->task)) return PTR_ERR(wb->task); - } else { - int node; - /* - * Set up a default cpumask for the flusher threads that - * includes all cpus on the same numa node as the device. - * The mask may be overridden via sysfs. - */ - node = dev_to_node(bdi->dev); - if (node != NUMA_NO_NODE) - cpumask_copy(bdi->flusher_cpumask, - cpumask_of_node(node)); } bdi_debug_register(bdi, dev_name(dev)); @@ -706,15 +634,6 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); - if (!bdi_cap_flush_forker(bdi)) { - bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!bdi->flusher_cpumask) - return -ENOMEM; - cpumask_setall(bdi->flusher_cpumask); - mutex_init(&bdi->flusher_cpumask_lock); - } else - bdi->flusher_cpumask = NULL; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); if (err) @@ -737,7 +656,6 @@ int bdi_init(struct backing_dev_info *bdi) err: while (i--) percpu_counter_destroy(&bdi->bdi_stat[i]); - kfree(bdi->flusher_cpumask); } return err; @@ -765,8 +683,6 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); - kfree(bdi->flusher_cpumask); - /* * If bdi_unregister() had already been called earlier, the * wakeup_timer could still be armed because bdi_prune_sb() diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index e5318c7793ae..4f3ea0b1e57c 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); return 0; } @@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgoup details. - */ - if (order >= HUGETLB_CGROUP_MIN_ORDER) - hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/trunk/mm/hugetlb_cgroup.c b/trunk/mm/hugetlb_cgroup.c index b5bde7a5c017..9cea7de22ffb 100644 --- a/trunk/mm/hugetlb_cgroup.c +++ b/trunk/mm/hugetlb_cgroup.c @@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -int __init hugetlb_cgroup_file_init(int idx) +static void __init __hugetlb_cgroup_file_init(int idx) { char buf[32]; struct cftype *cft; @@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - return 0; + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. 
This is because we use + * page[2].lru.next for storing cgroup details. + */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } } /* diff --git a/trunk/mm/kmemleak.c b/trunk/mm/kmemleak.c index a217cc544060..752a705c77c2 100644 --- a/trunk/mm/kmemleak.c +++ b/trunk/mm/kmemleak.c @@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); diff --git a/trunk/mm/ksm.c b/trunk/mm/ksm.c index 82dfb4b54321..51573858938d 100644 --- a/trunk/mm/ksm.c +++ b/trunk/mm/ksm.c @@ -1624,7 +1624,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1648,7 +1648,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, if (!search_new_forks || !mapcount) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); if (!mapcount) goto out; } @@ -1678,7 +1678,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1697,11 +1697,11 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; @@ -1731,7 +1731,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1749,11 +1749,11 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; diff --git a/trunk/mm/memcontrol.c b/trunk/mm/memcontrol.c index bbfac5063ca8..f3009b4bae51 100644 --- a/trunk/mm/memcontrol.c +++ b/trunk/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -267,6 +271,10 @@ struct mem_cgroup { struct work_struct work_freeing; }; + /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. 
@@ -282,6 +290,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -332,8 +341,61 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ +}; + +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a @@ -388,9 +450,13 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. 
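A worked instance of the cft->private encoding above with the new type: _KMEM is 3 in enum res_type, so MEMFILE_PRIVATE(_KMEM, RES_LIMIT) evaluates to (3 << 16) | RES_LIMIT, from which MEMFILE_TYPE() recovers 3 (_KMEM) and MEMFILE_ATTR() recovers RES_LIMIT.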
+ */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) { + static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -1453,6 +1588,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -2060,20 +2199,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. 
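consume_stock() above only succeeds when the per-cpu stock is cached for the same memcg and holds at least nr_pages; requests larger than the batch size skip the stock entirely and fall back to res_counter charging. A single-threaded userspace sketch of that decision (CHARGE_BATCH here is a stand-in value):

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U	/* stand-in for the kernel constant */

struct stock { const void *cached; unsigned int nr_pages; };

static bool consume_stock(struct stock *stock, const void *memcg,
			  unsigned int nr_pages)
{
	if (nr_pages > CHARGE_BATCH)
		return false;			/* too large, charge the res_counter */
	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		return true;
	}
	return false;				/* miss: caller charges and refills */
}

int main(void)
{
	int memcg_a, memcg_b;
	struct stock s = { .cached = &memcg_a, .nr_pages = 16 };

	printf("%d\n", consume_stock(&s, &memcg_a, 4));		/* hit */
	printf("%d\n", consume_stock(&s, &memcg_b, 4));		/* different memcg */
	printf("%d\n", consume_stock(&s, &memcg_a, 64));	/* over the batch */
	return 0;
}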
*/ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2250,7 +2397,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2371,7 +2519,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, memcg = *ptr; if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2396,7 +2544,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2431,7 +2579,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; @@ -2624,183 +2773,943 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* - * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compound_lock. - * charge/uncharge will be never happen and move_account() is done under - * compound_lock(), so we don't have to take care of races. 
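mem_cgroup_do_charge() now takes min_pages so the "retry with a single page instead of the batch" heuristic also works for kmem charges larger than one page. A loose, single-call userspace model of the order of the early checks only (the flag bits, the costly-order constant, and the reclaim outcome are stand-ins, and the real function also weighs OOM afterwards):

#include <stdbool.h>
#include <stdio.h>

#define GFP_WAIT	0x1u	/* stand-in for __GFP_WAIT */
#define GFP_NORETRY	0x2u	/* stand-in for __GFP_NORETRY */
#define PAGE_ALLOC_COSTLY_ORDER	3

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM, CHARGE_WOULDBLOCK };

static int charge_decision(unsigned int gfp, unsigned int nr_pages,
			   unsigned int min_pages, unsigned int margin,
			   bool reclaimed_some)
{
	/* never reclaim on behalf of optional batching: retry with min_pages */
	if (nr_pages > min_pages)
		return CHARGE_RETRY;
	if (!(gfp & GFP_WAIT))
		return CHARGE_WOULDBLOCK;
	if (gfp & GFP_NORETRY)
		return CHARGE_NOMEM;
	/* reclaim would run here; retry if it opened enough margin */
	if (margin >= nr_pages)
		return CHARGE_RETRY;
	/* small (non-costly) requests retry if reclaim made any progress */
	if (nr_pages <= (1u << PAGE_ALLOC_COSTLY_ORDER) && reclaimed_some)
		return CHARGE_RETRY;
	return CHARGE_NOMEM;
}

int main(void)
{
	printf("%d\n", charge_decision(GFP_WAIT, 32, 1, 0, false));	/* batch -> retry single */
	printf("%d\n", charge_decision(0, 1, 1, 0, false));		/* atomic -> wouldblock */
	printf("%d\n", charge_decision(GFP_WAIT | GFP_NORETRY, 1, 1, 0, false)); /* noretry -> nomem */
	return 0;
}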
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. */ -void mem_cgroup_split_huge_fixup(struct page *head) +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) { - struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *pc; - int i; + struct kmem_cache *cachep; - if (mem_cgroup_disabled()) - return; - for (i = 1; i < HPAGE_PMD_NR; i++) { - pc = head_pc + i; - pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - } + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * mem_cgroup_move_account - move account of the page - * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) - * @pc: page_cgroup of the page. - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. - * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, - struct page_cgroup *pc, - struct mem_cgroup *from, - struct mem_cgroup *to) +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) { - unsigned long flags; - int ret; - bool anon = PageAnon(page); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; - VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(page)); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + if (!memcg_can_account_kmem(memcg)) + return -EIO; - lock_page_cgroup(pc); + print_slabinfo_header(m); - ret = -EINVAL; - if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); - move_lock_mem_cgroup(from, &flags); + return 0; +} +#endif - if (!anon && page_mapped(page)) { - /* Update mapped_file data for mem_cgroup */ - preempt_disable(); - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - preempt_enable(); - } - mem_cgroup_charge_statistics(from, anon, -nr_pages); +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; - /* caller should have done css_get */ - pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, anon, nr_pages); - move_unlock_mem_cgroup(from, &flags); - ret = 0; -unlock: - unlock_page_cgroup(pc); /* - * check events + * Conditions under which we can wait for the oom_killer. 
Those are + * the same conditions tested by the core page allocator */ - memcg_check_events(to, page); - memcg_check_events(from, page); -out: + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chosed to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + return ret; } -/** - * mem_cgroup_move_parent - moves page to the parent group - * @page: the page to move - * @pc: page_cgroup of the page - * @child: page's cgroup - * - * move charges to its parent or the root cgroup if the group has no - * parent (aka use_hierarchy==0). - * Although this might fail (get_page_unless_zero, isolate_lru_page or - * mem_cgroup_move_account fails) the failure is always temporary and - * it signals a race with a page removal/uncharge or migration. In the - * first case the page is on the way out and it will vanish from the LRU - * on the next attempt and the call should be retried later. - * Isolation from the LRU fails only if page has been isolated from - * the LRU since we looked at it and that usually means either global - * reclaim or migration going on. The page will either get back to the - * LRU or vanish. - * Finaly mem_cgroup_move_account fails only if the page got uncharged - * (!PageCgroupUsed) or moved to a different group. The page will - * disappear in the next attempt. - */ -static int mem_cgroup_move_parent(struct page *page, - struct page_cgroup *pc, - struct mem_cgroup *child) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { - struct mem_cgroup *parent; - unsigned int nr_pages; - unsigned long uninitialized_var(flags); - int ret; + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); - VM_BUG_ON(mem_cgroup_is_root(child)); + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; - ret = -EBUSY; - if (!get_page_unless_zero(page)) - goto out; - if (isolate_lru_page(page)) - goto put; + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); +} - nr_pages = hpage_nr_pages(page); +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; - parent = parent_mem_cgroup(child); + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. 
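memcg_charge_kmem() charges the kmem counter first and rolls it back if the user counters (res, plus memsw when swap accounting is on) cannot be charged, so kmem never exceeds what the user counters saw; memcg_uncharge_kmem() releases all three. A stand-in model with plain integers instead of res_counter, ignoring the -EINTR bypass path described above:

#include <stdio.h>

struct counters { long res, memsw, kmem; };

static int charge_kmem(struct counters *c, long size, int user_charge_fails)
{
	c->kmem += size;
	if (user_charge_fails) {
		c->kmem -= size;	/* roll back the kmem charge */
		return -1;
	}
	c->res += size;
	c->memsw += size;
	return 0;
}

static void uncharge_kmem(struct counters *c, long size)
{
	c->res -= size;
	c->memsw -= size;
	c->kmem -= size;
}

int main(void)
{
	struct counters c = { 0, 0, 0 };

	charge_kmem(&c, 4096, 0);	/* successful charge */
	charge_kmem(&c, 4096, 1);	/* fails, leaves all counters untouched */
	uncharge_kmem(&c, 4096);
	printf("res=%ld memsw=%ld kmem=%ld\n", c.res, c.memsw, c.kmem);
	return 0;
}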
+ */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; /* - * If no parent, move charges to root cgroup. + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. */ - if (!parent) - parent = root_mem_cgroup; + memcg_kmem_set_activated(memcg); - if (nr_pages > 1) { - VM_BUG_ON(!PageTransHuge(page)); - flags = compound_lock_irqsave(page); + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; } - ret = mem_cgroup_move_account(page, nr_pages, - pc, child, parent); - if (!ret) - __mem_cgroup_cancel_local_charge(child, nr_pages); + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} - if (nr_pages > 1) - compound_unlock_irqrestore(page, flags); - putback_lru_page(page); -put: - put_page(page); -out: - return ret; +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; } /* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +void memcg_update_array_size(int num) { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - bool oom = true; - int ret; + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} - if (PageTransHuge(page)) { +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. 
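memcg_caches_array_size() above sizes the per-root-cache array by doubling the requested group count and clamping the result to [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE]. The same logic as a standalone userspace function, for illustration:

#include <stdio.h>
#include <sys/types.h>

#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

static size_t caches_array_size(int num_groups)
{
	ssize_t size;

	if (num_groups <= 0)
		return 0;
	size = 2 * num_groups;			/* leave headroom for growth */
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;
	return size;
}

int main(void)
{
	printf("%zu %zu %zu\n",
	       caches_array_size(1),		/* clamped up to 4 */
	       caches_array_size(100),		/* doubled to 200 */
	       caches_array_size(1 << 20));	/* clamped to 65535 */
	return 0;
}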
+ * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + kfree(cur_params); + } + return 0; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: + kfree(s->memcg_params); +} + +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. 
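memcg_stop_kmem_account()/memcg_resume_kmem_account() above keep a per-task depth counter that turns the accounting hook into a no-op while cache creation itself has to allocate, which is what breaks the recursion described in the comment. A minimal single-threaded model (all names below are illustrative only):

#include <stdio.h>

static int skip_account;	/* stands in for current->memcg_kmem_skip_account */

static void stop_account(void)   { skip_account++; }
static void resume_account(void) { skip_account--; }

static const char *get_cache(void)
{
	if (skip_account)
		return "root cache (accounting skipped)";
	return "per-memcg cache";
}

static void create_cache_enqueue(void)
{
	stop_account();
	/* allocations made while enqueueing see the guard and stay unaccounted */
	printf("inside creation: %s\n", get_cache());
	resume_account();
}

int main(void)
{
	printf("normal path:    %s\n", get_cache());
	create_cache_enqueue();
	printf("after creation: %s\n", get_cache());
	return 0;
}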
+ * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. + * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. + */ + schedule_work(&cachep->memcg_params->destroy); +} + +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor, s); + + if (new) + new->allocflags |= __GFP_KMEMCG; + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. 
+ */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + atomic_set(&new_cachep->memcg_params->nr_pages , 0); + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should caught this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. + * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still have active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. + */ + c->memcg_params->dead = false; + cancel_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. 
+ */ +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. + */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to mare sure we're always seeing the up to date value. The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. 
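__memcg_kmem_get_cache() above returns the per-memcg copy when it already exists and otherwise falls back to the root cache while the copy is created asynchronously. A userspace sketch of that lookup policy (the structures, and the "dentry(2:grp)" name mimicking the "%s(%d:%s)" scheme of memcg_cache_name(), are stand-ins):

#include <stddef.h>
#include <stdio.h>

#define NR_GROUPS 4

struct cache {
	const char *name;
	struct cache *memcg_caches[NR_GROUPS];	/* indexed by memcg_cache_id() */
};

static struct cache *pick_cache(struct cache *root, int memcg_id, int can_account)
{
	if (!can_account || memcg_id < 0)
		return root;
	if (root->memcg_caches[memcg_id] == NULL) {
		/* the kernel would schedule creation here; allocate from root for now */
		return root;
	}
	return root->memcg_caches[memcg_id];
}

int main(void)
{
	struct cache child = { .name = "dentry(2:grp)" };
	struct cache root  = { .name = "dentry" };

	printf("%s\n", pick_cache(&root, 2, 1)->name);	/* not created yet -> root */
	root.memcg_caches[2] = &child;
	printf("%s\n", pick_cache(&root, 2, 1)->name);	/* now the per-memcg copy */
	return 0;
}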
+ */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. + */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +} +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. 
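__memcg_kmem_newpage_charge()/__memcg_kmem_commit_charge() above split accounting into a charge step taken before the page exists and a commit step that either binds the memcg to the page or reverts the charge when the allocation failed. A userspace stand-in for that two-phase protocol (the structs below are not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct memcg { long kmem; };
struct page  { struct memcg *owner; };

static bool newpage_charge(struct memcg *cur, struct memcg **handle, long bytes)
{
	*handle = NULL;
	if (!cur)
		return true;		/* nothing to account against */
	cur->kmem += bytes;
	*handle = cur;			/* remembered for the commit step */
	return true;
}

static void commit_charge(struct page *page, struct memcg *memcg, long bytes)
{
	if (!memcg)
		return;
	if (!page) {			/* allocation failed: revert */
		memcg->kmem -= bytes;
		return;
	}
	page->owner = memcg;		/* the kernel records this in page_cgroup */
}

int main(void)
{
	struct memcg m = { 0 };
	struct memcg *handle;
	struct page p = { 0 };

	newpage_charge(&m, &handle, 4096);
	commit_charge(&p, handle, 4096);	/* success path */
	newpage_charge(&m, &handle, 4096);
	commit_charge(NULL, handle, 4096);	/* failed allocation reverts */
	printf("kmem=%ld owner=%p\n", m.kmem, (void *)p.owner);
	return 0;
}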
+ */ +void mem_cgroup_split_huge_fixup(struct page *head) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *pc; + int i; + + if (mem_cgroup_disabled()) + return; + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @pc: page_cgroup of the page. + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. + * + * The caller must confirm following. + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + bool anon = PageAnon(page); + + VM_BUG_ON(from == to); + VM_BUG_ON(PageLRU(page)); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + lock_page_cgroup(pc); + + ret = -EINVAL; + if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + goto unlock; + + move_lock_mem_cgroup(from, &flags); + + if (!anon && page_mapped(page)) { + /* Update mapped_file data for mem_cgroup */ + preempt_disable(); + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + preempt_enable(); + } + mem_cgroup_charge_statistics(from, anon, -nr_pages); + + /* caller should have done css_get */ + pc->mem_cgroup = to; + mem_cgroup_charge_statistics(to, anon, nr_pages); + move_unlock_mem_cgroup(from, &flags); + ret = 0; +unlock: + unlock_page_cgroup(pc); + /* + * check events + */ + memcg_check_events(to, page); + memcg_check_events(from, page); +out: + return ret; +} + +/** + * mem_cgroup_move_parent - moves page to the parent group + * @page: the page to move + * @pc: page_cgroup of the page + * @child: page's cgroup + * + * move charges to its parent or the root cgroup if the group has no + * parent (aka use_hierarchy==0). + * Although this might fail (get_page_unless_zero, isolate_lru_page or + * mem_cgroup_move_account fails) the failure is always temporary and + * it signals a race with a page removal/uncharge or migration. In the + * first case the page is on the way out and it will vanish from the LRU + * on the next attempt and the call should be retried later. + * Isolation from the LRU fails only if page has been isolated from + * the LRU since we looked at it and that usually means either global + * reclaim or migration going on. The page will either get back to the + * LRU or vanish. + * Finaly mem_cgroup_move_account fails only if the page got uncharged + * (!PageCgroupUsed) or moved to a different group. The page will + * disappear in the next attempt. 
+ */ +static int mem_cgroup_move_parent(struct page *page, + struct page_cgroup *pc, + struct mem_cgroup *child) +{ + struct mem_cgroup *parent; + unsigned int nr_pages; + unsigned long uninitialized_var(flags); + int ret; + + VM_BUG_ON(mem_cgroup_is_root(child)); + + ret = -EBUSY; + if (!get_page_unless_zero(page)) + goto out; + if (isolate_lru_page(page)) + goto put; + + nr_pages = hpage_nr_pages(page); + + parent = parent_mem_cgroup(child); + /* + * If no parent, move charges to root cgroup. + */ + if (!parent) + parent = root_mem_cgroup; + + if (nr_pages > 1) { + VM_BUG_ON(!PageTransHuge(page)); + flags = compound_lock_irqsave(page); + } + + ret = mem_cgroup_move_account(page, nr_pages, + pc, child, parent); + if (!ret) + __mem_cgroup_cancel_local_charge(child, nr_pages); + + if (nr_pages > 1) + compound_unlock_irqrestore(page, flags); + putback_lru_page(page); +put: + put_page(page); +out: + return ret; +} + +/* + * Charge the memory controller for page usage. + * Return + * 0 if the charge was successful + * < 0 if the cgroup is over its limit + */ +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, enum charge_type ctype) +{ + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + bool oom = true; + int ret; + + if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); /* @@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { @@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; + u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) cond_resched(); /* + * Kernel memory may not necessarily be trackable to a specific + * process. So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * * This is a safety check because mem_cgroup_force_empty_list * could have raced with mem_cgroup_replace_page_cache callers * so the lru seemed empty but the page could have been added * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. 
*/ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); } /* @@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. + */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } + must_inc_static_branch = true; + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. + */ + mem_cgroup_get(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. 
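memcg_update_kmem_limit() above establishes the limit and the cache index under cgroup_lock/set_limit_mutex, enables the static branch only after the locks are dropped, and sets KMEM_ACCOUNTED_ACTIVE last so accounting starts only once the call sites are patched. A compressed, single-threaded model of that sequence (the flag bits and the "static branch" boolean are stand-ins):

#include <stdio.h>

enum { KMEM_ACCOUNTED_ACTIVE, KMEM_ACCOUNTED_ACTIVATED, KMEM_ACCOUNTED_DEAD };

static unsigned long kmem_account_flags;
static int static_branch_enabled;	/* stands in for memcg_kmem_enabled_key */

static int kmem_accounting_active(void)
{
	/* simplification of memcg_kmem_enabled() && memcg_kmem_is_active() */
	return static_branch_enabled &&
	       (kmem_account_flags & (1 << KMEM_ACCOUNTED_ACTIVE));
}

int main(void)
{
	/* 1: under set_limit_mutex - limit set, cache index allocated */
	kmem_account_flags |= 1 << KMEM_ACCOUNTED_ACTIVATED;
	printf("after limit set:  %d\n", kmem_accounting_active());

	/* 2: locks dropped - patch the call sites */
	static_branch_enabled = 1;

	/* 3: finally flag the memcg as actively accounted */
	kmem_account_flags |= 1 << KMEM_ACCOUNTED_ACTIVE;
	printf("after activation: %d\n", kmem_accounting_active());
	return 0;
}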
+ * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + +#endif + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + goto out; + + memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + /* + * When that happen, we need to disable the static branch only on those + * memcgs that enabled it. To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); +#endif +out: + return ret; +} + /* * The user of this function is... * RES_LIMIT. @@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4054,7 +5097,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + 
enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); @@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + int ret; + + memcg->kmemcg_id = -1; + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + return mem_cgroup_sockets_init(memcg, ss); }; static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4748,6 +5821,37 @@ static struct cftype mem_cgroup_files[] = { .trigger = mem_cgroup_reset, .read = mem_cgroup_read, }, +#endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif #endif { }, /* terminate */ }; @@ -4816,16 +5920,29 @@ static struct mem_cgroup *mem_cgroup_alloc(void) } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. 
+ * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. */ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else vfree(memcg); } -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work) { - int node; + struct mem_cgroup *memcg; - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, free_work); + schedule_work(&memcg->work_freeing); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } @@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. 
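The freeing path above now runs mem_cgroup_put() -> call_rcu(free_rcu) -> schedule_work(free_work) -> __mem_cgroup_free(), so the final free happens in process context after an RCU grace period (the static keys disarmed there cannot be handled from the RCU callback). A stand-in model of that callback chain, with direct calls replacing RCU and the workqueue:

#include <stdio.h>

struct object { int refcnt; };

static void final_free(struct object *obj)
{
	/* where __mem_cgroup_free() disarms static keys and releases memory */
	printf("object %p freed in process context\n", (void *)obj);
}

static void work_callback(struct object *obj)
{
	final_free(obj);		/* models free_work() */
}

static void rcu_callback(struct object *obj)
{
	work_callback(obj);		/* free_rcu() would schedule_work() here */
}

static void put(struct object *obj)
{
	if (--obj->refcnt == 0)
		rcu_callback(obj);	/* the kernel uses call_rcu() here */
}

int main(void)
{
	struct object obj = { .refcnt = 1 };

	put(&obj);
	return 0;
}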
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index e6a3b933517e..e0a9b0ce4f10 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -3590,6 +3591,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { BUG(); + return 0; } #endif /* CONFIG_NUMA_BALANCING */ @@ -4117,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip) struct file *f = vma->vm_file; char *buf = (char *)__get_free_page(GFP_KERNEL); if (buf) { - char *p, *s; + char *p; p = d_path(&f->f_path, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; - s = strrchr(p, '/'); - if (s) - p = s+1; - printk("%s%s[%lx+%lx]", prefix, p, + printk("%s%s[%lx+%lx]", prefix, kbasename(p), vma->vm_start, vma->vm_end - vma->vm_start); free_page((unsigned long)buf); diff --git a/trunk/mm/memory_hotplug.c b/trunk/mm/memory_hotplug.c index 962e353aa86f..d04ed87bfacb 100644 --- a/trunk/mm/memory_hotplug.c +++ b/trunk/mm/memory_hotplug.c @@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ static bool can_online_high_movable(struct zone *zone) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. 
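print_vma_addr() above now uses kbasename() rather than open-coding the strrchr() lookup. A minimal userspace equivalent of that helper, assuming the usual "part after the last slash, or the whole string" semantics:

#include <stdio.h>
#include <string.h>

static const char *kbasename(const char *path)
{
	const char *tail = strrchr(path, '/');
	return tail ? tail + 1 : path;
}

int main(void)
{
	printf("%s\n", kbasename("/usr/lib/libc.so.6"));	/* libc.so.6 */
	printf("%s\n", kbasename("vmlinux"));			/* vmlinux */
	return 0;
}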
+ */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, diff --git a/trunk/mm/migrate.c b/trunk/mm/migrate.c index 32efd8028bc9..3b676b0c5c3e 100644 --- a/trunk/mm/migrate.c +++ b/trunk/mm/migrate.c @@ -1734,7 +1734,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, entry); + update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* * Finish the charge transaction under the page table lock to diff --git a/trunk/mm/mprotect.c b/trunk/mm/mprotect.c index 3dca970367db..94722a4d6b43 100644 --- a/trunk/mm/mprotect.c +++ b/trunk/mm/mprotect.c @@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_NUMA_BALANCING static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { spin_lock(&mm->page_table_lock); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); @@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { BUG(); } #endif /* CONFIG_NUMA_BALANCING */ -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -304,7 +305,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, 
nrpages); @@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index d037c8bc1512..2ad2ad168efe 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { + if (unlikely(compound_order(page) != order)) { bad_page(page); bad++; } @@ -2613,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2631,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2666,6 +2673,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2718,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. + */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { diff --git a/trunk/mm/shmem.c b/trunk/mm/shmem.c index 03f9ba8fb8e5..5c90d84c2b02 100644 --- a/trunk/mm/shmem.c +++ b/trunk/mm/shmem.c @@ -1719,7 +1719,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
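For the shmem lseek changes that follow (the 'origin' to 'whence' parameter rename and the SEEK_DATA/SEEK_HOLE handling), it may help to see the call from the other side: user space just passes the extra whence values to lseek(2). A small sketch, assuming a glibc system where _GNU_SOURCE exposes SEEK_DATA and SEEK_HOLE; the file path is only an example:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/sparse-demo", O_RDONLY);   /* illustrative path */
	if (fd < 0)
		return 1;

	/* First byte of data at or after offset 0, then the first hole
	 * after that data; -1 with errno ENXIO means "none left". */
	off_t data = lseek(fd, 0, SEEK_DATA);
	off_t hole = data >= 0 ? lseek(fd, data, SEEK_HOLE) : -1;

	printf("first data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}

For any other whence value, shmem_file_llseek() simply falls through to generic_file_llseek_size(), as the hunk below shows.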
*/ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int origin) + pgoff_t index, pgoff_t end, int whence) { struct page *page; struct pagevec pvec; @@ -1733,13 +1733,13 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pvec.nr = shmem_find_get_pages_and_swap(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) index = end; break; } for (i = 0; i < pvec.nr; i++, index++) { if (index < indices[i]) { - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { done = true; break; } @@ -1751,8 +1751,8 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, page = NULL; } if (index >= end || - (page && origin == SEEK_DATA) || - (!page && origin == SEEK_HOLE)) { + (page && whence == SEEK_DATA) || + (!page && whence == SEEK_HOLE)) { done = true; break; } @@ -1765,15 +1765,15 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, return index; } -static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t start, end; loff_t new_offset; - if (origin != SEEK_DATA && origin != SEEK_HOLE) - return generic_file_llseek_size(file, offset, origin, + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); mutex_lock(&inode->i_mutex); /* We're holding i_mutex so we can access i_size directly */ @@ -1785,12 +1785,12 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) else { start = offset >> PAGE_CACHE_SHIFT; end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset = shmem_seek_hole_data(mapping, start, end, whence); new_offset <<= PAGE_CACHE_SHIFT; if (new_offset > offset) { if (new_offset < inode->i_size) offset = new_offset; - else if (origin == SEEK_DATA) + else if (whence == SEEK_DATA) offset = -ENXIO; else offset = inode->i_size; diff --git a/trunk/mm/slab.c b/trunk/mm/slab.c index 33d3363658df..e7667a3584bc 100644 --- a/trunk/mm/slab.c +++ b/trunk/mm/slab.c @@ -87,7 +87,6 @@ */ #include -#include "slab.h" #include #include #include @@ -128,6 +127,8 @@ #include "internal.h" +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -162,23 +163,6 @@ */ static bool pfmemalloc_active __read_mostly; -/* Legal flag mask for kmem_cache_create(). 
*/ -#if DEBUG -# define CREATE_MASK (SLAB_RED_ZONE | \ - SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#endif - /* * kmem_bufctl_t: * @@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = { #undef CACHE }; -static struct arraycache_init initarray_cache __initdata = - { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache kmem_cache_boot = { - .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -662,6 +642,26 @@ static void init_node_lock_keys(int q) } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + struct kmem_list3 *l3; + l3 = cachep->nodelists[q]; + if (!l3) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -678,6 +678,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1576,29 +1587,34 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) } } +/* + * The memory after the last cpu cache pointer is used for the + * the nodelists pointer. + */ +static void setup_nodelists_pointer(struct kmem_cache *cachep) +{ + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; +} + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). */ void __init kmem_cache_init(void) { - size_t left_over; struct cache_sizes *sizes; struct cache_names *names; int i; - int order; - int node; kmem_cache = &kmem_cache_boot; + setup_nodelists_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; - for (i = 0; i < NUM_INIT_LISTS; i++) { + for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); - if (i < MAX_NUMNODES) - kmem_cache->nodelists[i] = NULL; - } + set_up_list3s(kmem_cache, CACHE_CACHE); /* @@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
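The new setup_nodelists_pointer() above (and the offsetof()-based size passed to create_boot_cache() a little further down) relies on a layout trick: the per-node list pointers are stored immediately after the per-CPU array that ends struct kmem_cache, so a single allocation covers both tables. A user-space sketch of the same idea, with invented struct and field names (struct cache, percpu, nodes) standing in for the kernel types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins; only the memory layout matters here. */
struct node_list { int dummy; };

struct cache {
	int limit;
	struct node_list **nodes;   /* will point into the same allocation */
	void *percpu[];             /* nr_cpus slots, then nr_nodes pointers */
};

int main(void)
{
	size_t nr_cpus = 4, nr_nodes = 2;
	size_t size = offsetof(struct cache, percpu) +
		      nr_cpus * sizeof(void *) +
		      nr_nodes * sizeof(struct node_list *);
	struct cache *c = calloc(1, size);

	if (!c)
		return 1;

	/* Equivalent of setup_nodelists_pointer(): the node table starts
	 * right after the last per-CPU slot. */
	c->nodes = (struct node_list **)&c->percpu[nr_cpus];
	c->nodes[0] = NULL;

	printf("one %zu-byte block holds both trailing arrays\n", size);
	free(c);
	return 0;
}

The payoff in the patch is that the bootstrap cache no longer needs the static kmem_cache_nodelists[MAX_NUMNODES] array being deleted above.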
*/ - node = numa_mem_id(); - /* 1) create the kmem_cache */ - INIT_LIST_HEAD(&slab_caches); - list_add(&kmem_cache->list, &slab_caches); - kmem_cache->colour_off = cache_line_size(); - kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; - kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *); - kmem_cache->object_size = kmem_cache->size; - kmem_cache->size = ALIGN(kmem_cache->object_size, - cache_line_size()); - kmem_cache->reciprocal_buffer_size = - reciprocal_value(kmem_cache->size); - - for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, kmem_cache->size, - cache_line_size(), 0, &left_over, &kmem_cache->num); - if (kmem_cache->num) - break; - } - BUG_ON(!kmem_cache->num); - kmem_cache->gfporder = order; - kmem_cache->colour = left_over / kmem_cache->colour_off; - kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + - sizeof(struct slab), cache_line_size()); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *), + SLAB_HWCACHE_ALIGN); + list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; - sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); - - if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; - sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); - } + sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, + sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); + + if (INDEX_AC != INDEX_L3) + sizes[INDEX_L3].cs_cachep = + create_kmalloc_cache(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); slab_early_init = 0; @@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. 
*/ - if (!sizes->cs_cachep) { - sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_cachep->name = names->name; - sizes->cs_cachep->size = sizes->cs_size; - sizes->cs_cachep->object_size = sizes->cs_size; - sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes->cs_cachep->list, &slab_caches); - } + if (!sizes->cs_cachep) + sizes->cs_cachep = create_kmalloc_cache(names->name, + sizes->cs_size, ARCH_KMALLOC_FLAGS); + #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_dmacachep->name = names->name_dma; - sizes->cs_dmacachep->size = sizes->cs_size; - sizes->cs_dmacachep->object_size = sizes->cs_size; - sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_dmacachep, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); - list_add(&sizes->cs_dmacachep->list, &slab_caches); + sizes->cs_dmacachep = create_kmalloc_cache( + names->name_dma, sizes->cs_size, + SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); #endif sizes++; names++; @@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* @@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (page->pfmemalloc) SetPageSlabPfmemalloc(page + i); } + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) __ClearPageSlab(page); page++; } + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) if (slab_state == DOWN) { /* - * Note: the first kmem_cache_create must create the cache + * Note: Creation of first cache (kmem_cache). + * The setup_list3s is taken care + * of by the caller of __kmem_cache_create + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + slab_state = PARTIAL; + } else if (slab_state == PARTIAL) { + /* + * Note: the second kmem_cache_create must create the cache * that's used by kmalloc(24), otherwise the creation of * further caches will BUG(). */ @@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /* * If the cache that's used by kmalloc(sizeof(kmem_list3)) is - * the first cache, then we need to set up all its list3s, + * the second cache, then we need to set up all its list3s, * otherwise the creation of further caches will BUG(). */ set_up_list3s(cachep, SIZE_AC); @@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) else slab_state = PARTIAL_ARRAYCACHE; } else { + /* Remaining boot caches */ cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init), gfp); @@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /** * __kmem_cache_create - Create a cache. 
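kmem_getpages() and kmem_freepages() now bracket every slab's pages with memcg_bind_pages()/memcg_release_pages(). As the slab.h hunk later in this patch shows, the release side uses an atomic "subtract and test for zero" so that freeing the last page of an empty per-memcg cache can trigger its destruction. A user-space rendering of that pattern with C11 atomics (cache_destroy, bind_pages and release_pages are invented names for the sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_pages = 0;

static void cache_destroy(void)
{
	puts("last page gone, destroying the cache");
}

static void bind_pages(int order)
{
	atomic_fetch_add(&nr_pages, 1 << order);
}

static void release_pages(int order)
{
	/* fetch_sub returns the old value, so reaching zero afterwards
	 * mirrors the kernel's atomic_sub_and_test(). */
	if (atomic_fetch_sub(&nr_pages, 1 << order) == (1 << order))
		cache_destroy();
}

int main(void)
{
	bind_pages(1);      /* a 2-page slab */
	release_pages(0);   /* one page left */
	release_pages(0);   /* last page: triggers the destroy path */
	return 0;
}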
- * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. + * @cachep: cache management descriptor * @flags: SLAB flags - * @ctor: A constructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. @@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - /* - * Always checks flags, a caller might be expecting debug support which - * isn't available. - */ - BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. This is needed to avoid @@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(BYTES_PER_WORD - 1); } - /* calculate the final buffer alignment: */ - - /* 1) arch recommendation: can be overridden for debug */ - if (flags & SLAB_HWCACHE_ALIGN) { - /* - * Default alignment: as specified by the arch code. Except if - * an object is really small, then squeeze multiple objects into - * one cacheline. - */ - ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - } else { - ralign = BYTES_PER_WORD; - } - /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated @@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(REDZONE_ALIGN - 1); } - /* 2) arch mandated alignment */ - if (ralign < ARCH_SLAB_MINALIGN) { - ralign = ARCH_SLAB_MINALIGN; - } /* 3) caller mandated alignment */ if (ralign < cachep->align) { ralign = cachep->align; @@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + setup_nodelists_pointer(cachep); #if DEBUG /* @@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); return 0; } @@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); @@ -3969,12 +3935,6 @@ void kfree(const void *objp) } EXPORT_SYMBOL(kfree); -unsigned int kmem_cache_size(struct kmem_cache *cachep) -{ - return cachep->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - /* * This initializes kmem_list3 or resizes various caches for all nodes. 
*/ @@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info) } /* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = cache_from_memcg(cachep, i); + if (c) + /* return value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + /* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); @@ -4276,54 +4275,8 @@ static void cache_reap(struct work_struct *w) } #ifdef CONFIG_SLABINFO - -static void print_slabinfo_header(struct seq_file *m) -{ - /* - * Output format version, so at least we can change it - * without _too_ many complaints. 
- */ -#if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else - seq_puts(m, "slabinfo - version: 2.1\n"); -#endif - seq_puts(m, "# name " - " "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); -#if STATS - seq_puts(m, " : globalstat " - " "); - seq_puts(m, " : cpustat "); -#endif - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) -{ - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p) if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->size, - cachep->num, (1 << cachep->gfporder)); - seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, cachep->shared); - seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = num_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ #if STATS { /* list3 stats */ unsigned long high = cachep->high_mark; @@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p) allochit, allocmiss, freehit, freemiss); } #endif - seq_putc(m, '\n'); - return 0; } -/* - * slabinfo_op - iterator that generates /proc/slabinfo - * - * Output layout: - * cache-name - * num-active-objs - * total-objs - * object size - * num-active-slabs - * total-slabs - * num-pages-per-slab - * + further values on SMP and with statistics enabled - */ - -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - #define MAX_SLABINFO_WRITE 128 /** * slabinfo_write - Tuning for the slab allocator @@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -static ssize_t slabinfo_write(struct file *file, const char __user *buffer, +ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; @@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, return res; } -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} - -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_DEBUG_SLAB_LEAK static void *leaks_start(struct seq_file *m, loff_t *pos) @@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } +static 
void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + static const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, @@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/trunk/mm/slab.h b/trunk/mm/slab.h index 7deeb449a301..34a98d642196 100644 --- a/trunk/mm/slab.h +++ b/trunk/mm/slab.h @@ -32,19 +32,201 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + /* Functions provided by the slab allocators */ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +struct mem_cgroup; #ifdef CONFIG_SLUB -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); #else -static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + int __kmem_cache_shutdown(struct kmem_cache *); +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline 
bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return s->memcg_params->memcg_caches[idx]; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} +#endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. 
%s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} #endif diff --git a/trunk/mm/slab_common.c b/trunk/mm/slab_common.c index 069a24e64403..3f3cd97d3fdf 100644 --- a/trunk/mm/slab_common.c +++ b/trunk/mm/slab_common.c @@ -13,9 +13,12 @@ #include #include #include +#include +#include #include #include #include +#include #include "slab.h" @@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, + size_t size) { struct kmem_cache *s = NULL; @@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) continue; } - if (!strcmp(s->name, name)) { + /* + * For simplicity, we won't check this in the list of memcg + * caches. We have control over memcg naming, and if there + * aren't duplicates in the global list, there won't be any + * duplicates in the memcg lists as well. + */ + if (!memcg && !strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, + const char *name, size_t size) { return 0; } #endif +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. + */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + + /* * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) * as davem. 
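To make the calculate_alignment() helper that slab_common.c now exports a bit more concrete: with SLAB_HWCACHE_ALIGN, a 64-byte cache line and a 24-byte object, ralign halves from 64 to 32 (since 24 <= 32) and stops there (24 > 16), so two such objects can share one cache line instead of each taking a whole line. The same arithmetic lifted into a stand-alone program, with the cache-line size hard-coded to 64 and sizeof(void *) standing in for ARCH_SLAB_MINALIGN:

#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

static unsigned long calc_align(int hwcache_align, unsigned long align,
				unsigned long size)
{
	if (hwcache_align) {
		unsigned long ralign = 64;   /* sample cache_line_size() */

		while (size <= ralign / 2)
			ralign /= 2;
		if (ralign > align)
			align = ralign;
	}
	if (align < sizeof(void *))          /* ARCH_SLAB_MINALIGN stand-in */
		align = sizeof(void *);
	return ALIGN_UP(align, sizeof(void *));
}

int main(void)
{
	printf("24-byte object  -> align %lu\n", calc_align(1, 0, 24));   /* 32 */
	printf("200-byte object -> align %lu\n", calc_align(1, 0, 200));  /* 64 */
	return 0;
}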
*/ -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *), + struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; int err = 0; @@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(name, size) == 0) + if (!kmem_cache_sanity_check(memcg, name, size) == 0) goto out_locked; + /* + * Some allocators will constraint the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags. + */ + flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(name, size, align, flags, ctor); + s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) goto out_locked; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (s) { s->object_size = s->size = size; - s->align = align; + s->align = calculate_alignment(flags, align, size); s->ctor = ctor; + + if (memcg_register_cache(memcg, s, parent_cache)) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { kmem_cache_free(kmem_cache, s); @@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align err = __kmem_cache_create(s, flags); if (!err) { - s->refcount = 1; list_add(&s->list, &slab_caches); - + memcg_cache_list_add(memcg, s); } else { kfree(s->name); kmem_cache_free(kmem_cache, s); @@ -157,10 +239,20 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align return s; } + +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); +} EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) { + /* Destroy all the children caches if we aren't a memcg cache */ + kmem_cache_destroy_memcg_children(s); + get_online_cpus(); mutex_lock(&slab_mutex); s->refcount--; @@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + memcg_release_cache(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { @@ -192,3 +285,182 @@ int slab_is_available(void) { return slab_state >= UP; } + +#ifndef CONFIG_SLOB +/* Create a cache during boot when no slab services are available yet */ +void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, + unsigned long flags) +{ + int err; + + s->name = name; + s->size = s->object_size = size; + s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + err = __kmem_cache_create(s, flags); + + if (err) + panic("Creation of kmalloc slab %s size=%zd failed. 
Reason %d\n", + name, size, err); + + s->refcount = -1; /* Exempt from merging for now */ +} + +struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, + unsigned long flags) +{ + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + if (!s) + panic("Out of memory when creating slab %s\n", name); + + create_boot_cache(s, name, size, flags); + list_add(&s->list, &slab_caches); + s->refcount = 1; + return s; +} + +#endif /* !CONFIG_SLOB */ + + +#ifdef CONFIG_SLABINFO +void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. + */ +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name " + " "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, " : globalstat " + " "); + seq_puts(m, " : cpustat "); +#endif + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&slab_mutex); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) +{ + struct slabinfo sinfo; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + + memcg_accumulate_slabinfo(s, &sinfo); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, + sinfo.objects_per_slab, (1 << sinfo.cache_order)); + + seq_printf(m, " : tunables %4u %4u %4u", + sinfo.limit, sinfo.batchcount, sinfo.shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); + slabinfo_show_stats(m, s); + seq_putc(m, '\n'); + return 0; +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + 
proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); +#endif /* CONFIG_SLABINFO */ diff --git a/trunk/mm/slob.c b/trunk/mm/slob.c index 1e921c5e9576..a99fdf7a0907 100644 --- a/trunk/mm/slob.c +++ b/trunk/mm/slob.c @@ -28,9 +28,8 @@ * from kmalloc are prepended with a 4-byte header with the kmalloc size. * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked, and also stores the exact - * allocation size in page->private so that it can be used to accurately - * provide ksize(). These objects are detected in kfree() because slob_page() + * does not have to be separately tracked. + * These objects are detected in kfree() because PageSlab() * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and @@ -59,7 +58,6 @@ #include #include -#include "slab.h" #include #include /* struct reclaim_state */ @@ -74,6 +72,7 @@ #include +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). @@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp) #define SLOB_UNIT sizeof(slob_t) #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) -#define SLOB_ALIGN L1_CACHE_BYTES /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which @@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) if (likely(order)) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - if (ret) { - struct page *page; - page = virt_to_page(ret); - page->private = size; - } trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); @@ -506,7 +499,7 @@ void kfree(const void *block) unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else - put_page(sp); + __free_pages(sp, compound_order(sp)); } EXPORT_SYMBOL(kfree); @@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree); size_t ksize(const void *block) { struct page *sp; + int align; + unsigned int *m; BUG_ON(!block); if (unlikely(block == ZERO_SIZE_PTR)) return 0; sp = virt_to_page(block); - if (PageSlab(sp)) { - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - unsigned int *m = (unsigned int *)(block - align); - return SLOB_UNITS(*m) * SLOB_UNIT; - } else - return sp->private; + if (unlikely(!PageSlab(sp))) + return PAGE_SIZE << compound_order(sp); + + align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; } EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - size_t align = c->size; - if (flags & SLAB_DESTROY_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } c->flags = flags; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - return 0; } @@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, PAGE_SIZE << get_order(c->size), flags, node); } @@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); -unsigned int kmem_cache_size(struct kmem_cache *c) -{ - return c->size; -} -EXPORT_SYMBOL(kmem_cache_size); - int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/trunk/mm/slub.c b/trunk/mm/slub.c index 487f0bdd53c0..ba2ca53f6c3a 100644 --- a/trunk/mm/slub.c +++ b/trunk/mm/slub.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -112,9 +113,6 @@ * the fast path and disables lockless freelists. */ -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) - static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG @@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -static int kmem_size = sizeof(struct kmem_cache); - #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif @@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing( if (!check_object(s, page, object, SLUB_RED_ACTIVE)) goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { + } else if (!page->slab_cache) { printk(KERN_ERR "SLUB : no slab for object 0x%p.\n", object); @@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - page->slab = s; + memcg_bind_pages(s, order); + page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); @@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = 
page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + + memcg_release_pages(s, order); reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h) else page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) @@ -1872,12 +1874,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freel /* * Unfreeze all the cpu partial slabs. * - * This function must be called with interrupt disabled. + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). */ -static void unfreeze_partials(struct kmem_cache *s) +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct kmem_cache_node *n = NULL, *n2 = NULL; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); struct page *page, *discard_page = NULL; while ((page = c->partial)) { @@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * set to the per node partial list. */ local_irq_save(flags); - unfreeze_partials(s); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); local_irq_restore(flags); oldpage = NULL; pobjects = 0; @@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s); + unfreeze_partials(s, c); } } @@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: /* @@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void *prior; void **object = (void *)x; int was_frozen; - int inuse; struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } prior = page->freelist; counters = page->counters; set_freepointer(s, object, prior); new.counters = counters; was_frozen = new.frozen; new.inuse--; - if ((!new.inuse || !prior) && !was_frozen && !n) { + if ((!new.inuse || !prior) && !was_frozen) { if (!kmem_cache_debug(s) && !prior) @@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } } - inuse = new.inuse; } while (!cmpxchg_double_slab(s, page, prior, counters, @@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + goto slab_empty; + /* - * was_frozen may have been set after we acquired the list_lock in - * an earlier loop. So we need to check it here again. + * Objects left in the slab. If it was not on the partial list before + * then add it. 
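The __slab_free() rework above keeps the usual optimistic shape: snapshot the freelist and counters, build the new values, publish them with one cmpxchg_double_slab(), and go around again if another CPU got there first (dropping the node list_lock before retrying). Stripped of the slab specifics, that read-modify-compare-exchange loop looks like the C11 sketch below; the lock-free list push is only an illustration of the pattern, not the slab code:

#include <stdatomic.h>
#include <stdio.h>

struct node {
	int value;
	struct node *next;
};

static _Atomic(struct node *) head = NULL;

static void push(struct node *n)
{
	struct node *old = atomic_load(&head);

	do {
		/* Speculative part: derive the new state from the snapshot. */
		n->next = old;
		/* Publish; on failure 'old' is reloaded and we try again. */
	} while (!atomic_compare_exchange_weak(&head, &old, n));
}

int main(void)
{
	static struct node a = { 1, NULL }, b = { 2, NULL };

	push(&a);
	push(&b);
	for (struct node *n = atomic_load(&head); n; n = n->next)
		printf("%d\n", n->value);   /* prints 2 then 1 */
	return 0;
}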
*/ - if (was_frozen) - stat(s, FREE_FROZEN); - else { - if (unlikely(!inuse && n->nr_partial > s->min_partial)) - goto slab_empty; - - /* - * Objects left in the slab. If it was not on the partial list before - * then add it. - */ - if (unlikely(!prior)) { - remove_full(s, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } + if (kmem_cache_debug(s) && unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return; @@ -2619,19 +2618,10 @@ static __always_inline void slab_free(struct kmem_cache *s, void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - if (kmem_cache_debug(s) && page->slab != s) { - pr_err("kmem_cache_free: Wrong slab cache. %s but object" - " is from %s\n", page->slab->name, s->name); - WARN_ON_ONCE(1); + s = cache_from_obj(s, x); + if (!s) return; - } - - slab_free(s, page, x, _RET_IP_); - + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved) return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static void init_kmem_cache_node(struct kmem_cache_node *n) { @@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->object_size; - unsigned long align = s->align; int order; /* @@ -2999,20 +2962,12 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) size += sizeof(void *); #endif - /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. - */ - align = calculate_alignment(flags, align, s->object_size); - s->align = align; - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. 
*/ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; if (forced_order >= 0) order = forced_order; @@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->max = s->oo; return !!oo_objects(s->oo); - } static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) @@ -3127,15 +3081,6 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) return -EINVAL; } -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { int rc = kmem_cache_close(s); - if (!rc) + if (!rc) { + /* + * We do the same lock strategy around sysfs_slab_add, see + * __kmem_cache_create. Because this is pretty much the last + * operation we do and the lock will be released shortly after + * that in slab_common.c, we could just move sysfs_slab_remove + * to a later point in common code. We should do that when we + * have a common sysfs framework for all allocators. + */ + mutex_unlock(&slab_mutex); sysfs_slab_remove(s); + mutex_lock(&slab_mutex); + } return rc; } @@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *__init create_kmalloc_cache(const char *name, - int size, unsigned int flags) -{ - struct kmem_cache *s; - - s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - - s->name = name; - s->size = s->object_size = size; - s->align = ARCH_KMALLOC_MINALIGN; - - /* - * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slab_mutex here. - */ - if (kmem_cache_open(s, flags)) - goto panic; - - list_add(&s->list, &slab_caches); - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); - return NULL; -} - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. 
This is necessary for slabs < 192 since we have non power @@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3424,7 +3354,7 @@ size_t ksize(const void *object) return PAGE_SIZE << compound_order(page); } - return slab_ksize(page->slab); + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); @@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x) } slab_lock(page); - if (on_freelist(page->slab, page, object)) { - object_err(page->slab, page, object, "Object is on free-list"); + if (on_freelist(page->slab_cache, page, object)) { + object_err(page->slab_cache, page, object, "Object is on free-list"); rv = false; } else { rv = true; @@ -3478,10 +3408,10 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - __free_pages(page, compound_order(page)); + __free_memcg_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self, /* * Used for early kmem_cache structures that were allocated using - * the page allocator + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. */ -static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - list_add(&s->list, &slab_caches); - s->refcount = -1; + memcpy(s, static_cache, kmem_cache->object_size); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) if (n) { list_for_each_entry(p, &n->partial, lru) - p->slab = s; + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) - p->slab = s; + p->slab_cache = s; #endif } } + list_add(&s->list, &slab_caches); + return s; } void __init kmem_cache_init(void) { + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; int i; - int caches = 0; - struct kmem_cache *temp_kmem_cache; - int order; - struct kmem_cache *temp_kmem_cache_node; - unsigned long kmalloc_size; + int caches = 2; if (debug_guardpage_minorder()) slub_max_order = 0; - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); - - /* Allocate two kmem_caches from the page allocator */ - kmalloc_size = ALIGN(kmem_size, cache_line_size()); - order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); - - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. 
- */ - kmem_cache_node = (void *)kmem_cache + kmalloc_size; + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; - kmem_cache_node->name = "kmem_cache_node"; - kmem_cache_node->size = kmem_cache_node->object_size = - sizeof(struct kmem_cache_node); - kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; - temp_kmem_cache = kmem_cache; - kmem_cache->name = "kmem_cache"; - kmem_cache->size = kmem_cache->object_size = kmem_size; - kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); - kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache, temp_kmem_cache, kmem_size); + kmem_cache = bootstrap(&boot_kmem_cache); /* * Allocate kmem_cache_node properly from the kmem_cache slab. * kmem_cache_node is separately allocated so no need to * update any list pointers. */ - temp_kmem_cache_node = kmem_cache_node; - - kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); - - kmem_cache_bootstrap_fixup(kmem_cache_node); - - caches++; - kmem_cache_bootstrap_fixup(kmem_cache); - caches++; - /* Free temporary boot structure */ - free_pages((unsigned long)temp_kmem_cache, order); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ @@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, +static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, if (s->size - size >= sizeof(void *)) continue; + if (!cache_match_memcg(s, memcg)) + continue; + return s; } return NULL; } -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(size, align, flags, name, ctor); + s = find_mergeable(memcg, size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (err) return err; + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. 
This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + + if (!is_root_cache(s)) + return; + + /* + * This means this cache had no attribute written. Therefore, no point + * in copying default values around. + */ + if (!s->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. + */ + if (buffer) + buf = buffer; + else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(s->memcg_params->root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; - - if (slab_state < FULL) - /* Defer until later */ - return 0; + int unmergeable = slab_unmergeable(s); - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. 
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name " - " "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; - struct kmem_cache *s; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p) nr_free += count_partial(n, count_free); } - nr_inuse = nr_objs - nr_free; - - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, oo_objects(s->oo), - (1 << oo_order(s->oo))); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); } -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int slabinfo_open(struct inode *inode, struct file *file) +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { - return seq_open(file, &slabinfo_op); } -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init slab_proc_init(void) +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { - proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); - return 0; + return -EIO; } -module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ diff --git a/trunk/mm/vmscan.c b/trunk/mm/vmscan.c index 7f3096137b8a..adc7e9058181 100644 --- a/trunk/mm/vmscan.c +++ b/trunk/mm/vmscan.c @@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get rescheduled. When there is a massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will shrink and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. 
*/ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) @@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } @@ -2558,7 +2570,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - int all_zones_ok; + struct zone *unbalanced_zone; unsigned long balanced; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -2592,7 +2604,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; - all_zones_ok = 1; + unbalanced_zone = NULL; balanced = 0; /* @@ -2731,7 +2743,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, } if (!zone_balanced(zone, testorder, 0, end_zone)) { - all_zones_ok = 0; + unbalanced_zone = zone; /* * We are still under min water mark. This * means that we have a GFP_ATOMIC allocation @@ -2764,7 +2776,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) + if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2774,7 +2786,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (has_under_min_watermark_zone) count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); else - congestion_wait(BLK_RW_ASYNC, HZ/10); + wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); } /* @@ -2793,7 +2805,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * high-order: Balanced zones must make up at least 25% of the node * for the node to be balanced */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { + if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) { cond_resched(); try_to_freeze(); diff --git a/trunk/net/atm/atm_sysfs.c b/trunk/net/atm/atm_sysfs.c index f49da5814bc3..350bf62b2ae3 100644 --- a/trunk/net/atm/atm_sysfs.c +++ b/trunk/net/atm/atm_sysfs.c @@ -14,49 +14,45 @@ static ssize_t show_type(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); - return sprintf(buf, "%s\n", adev->type); + + return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type); } static ssize_t show_address(struct device *cdev, struct device_attribute *attr, char *buf) { - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); - int i; - - for (i = 0; i < (ESI_LEN - 1); i++) - pos += sprintf(pos, "%02x:", adev->esi[i]); - pos += sprintf(pos, "%02x\n", adev->esi[i]); - return pos - buf; + return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi); } static ssize_t show_atmaddress(struct device *cdev, struct device_attribute *attr, char *buf) { unsigned long flags; - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); struct atm_dev_addr *aaddr; int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin; - int i, j; + int i, j, count = 0; spin_lock_irqsave(&adev->lock, flags); list_for_each_entry(aaddr, &adev->local, entry) { for (i = 
0, j = 0; i < ATM_ESA_LEN; ++i, ++j) { if (j == *fmt) { - pos += sprintf(pos, "."); + count += scnprintf(buf + count, + PAGE_SIZE - count, "."); ++fmt; j = 0; } - pos += sprintf(pos, "%02x", - aaddr->addr.sas_addr.prv[i]); + count += scnprintf(buf + count, + PAGE_SIZE - count, "%02x", + aaddr->addr.sas_addr.prv[i]); } - pos += sprintf(pos, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); } spin_unlock_irqrestore(&adev->lock, flags); - return pos - buf; + return count; } static ssize_t show_atmindex(struct device *cdev, @@ -64,25 +60,21 @@ static ssize_t show_atmindex(struct device *cdev, { struct atm_dev *adev = to_atm_dev(cdev); - return sprintf(buf, "%d\n", adev->number); + return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number); } static ssize_t show_carrier(struct device *cdev, struct device_attribute *attr, char *buf) { - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); - pos += sprintf(pos, "%d\n", - adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); - - return pos - buf; + return scnprintf(buf, PAGE_SIZE, "%d\n", + adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); } static ssize_t show_link_rate(struct device *cdev, struct device_attribute *attr, char *buf) { - char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); int link_rate; @@ -100,9 +92,7 @@ static ssize_t show_link_rate(struct device *cdev, default: link_rate = adev->link_rate * 8 * 53; } - pos += sprintf(pos, "%d\n", link_rate); - - return pos - buf; + return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); diff --git a/trunk/net/bridge/br_mdb.c b/trunk/net/bridge/br_mdb.c index 6f0a2eebcb27..acc9f4cc18f7 100644 --- a/trunk/net/bridge/br_mdb.c +++ b/trunk/net/bridge/br_mdb.c @@ -83,9 +83,12 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, if (port) { struct br_mdb_entry e; e.ifindex = port->dev->ifindex; - e.addr.u.ip4 = p->addr.u.ip4; + e.state = p->state; + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif e.addr.proto = p->addr.proto; if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) { @@ -253,6 +256,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry) #endif } else return false; + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) + return false; return true; } @@ -310,7 +315,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group) + struct br_ip *group, unsigned char state) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -336,7 +341,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - p = br_multicast_new_port_group(port, group, *pp); + p = br_multicast_new_port_group(port, group, *pp, state); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); @@ -373,7 +378,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, #endif spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip); + ret = br_mdb_add_group(br, p, &ip, entry->state); spin_unlock_bh(&br->multicast_lock); return ret; } @@ -479,3 +484,10 @@ void br_mdb_init(void) rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL); } + +void br_mdb_uninit(void) +{ + rtnl_unregister(PF_BRIDGE, 
RTM_GETMDB); + rtnl_unregister(PF_BRIDGE, RTM_NEWMDB); + rtnl_unregister(PF_BRIDGE, RTM_DELMDB); +} diff --git a/trunk/net/bridge/br_multicast.c b/trunk/net/bridge/br_multicast.c index 1093c89095d8..5391ca43336a 100644 --- a/trunk/net/bridge/br_multicast.c +++ b/trunk/net/bridge/br_multicast.c @@ -279,7 +279,7 @@ static void br_multicast_port_group_expired(unsigned long data) spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || - hlist_unhashed(&pg->mglist)) + hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT) goto out; br_multicast_del_pg(br, pg); @@ -622,7 +622,8 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, - struct net_bridge_port_group __rcu *next) + struct net_bridge_port_group __rcu *next, + unsigned char state) { struct net_bridge_port_group *p; @@ -632,6 +633,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->addr = *group; p->port = port; + p->state = state; rcu_assign_pointer(p->next, next); hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, @@ -674,7 +676,7 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp); + p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); @@ -1165,7 +1167,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (max_delay) group = &mld->mld_mca; } else if (skb->len >= sizeof(*mld2q)) { - u16 mrc; if (!pskb_may_pull(skb, sizeof(*mld2q))) { err = -EINVAL; goto out; @@ -1173,8 +1174,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - mrc = ntohs(mld2q->mld2q_mrc); - max_delay = mrc ? MLDV2_MRC(mrc) : 1; + max_delay = mld2q->mld2q_mrc ? 
MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; } if (!group) @@ -1633,6 +1633,7 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->multicast_querier_timer); del_timer_sync(&br->multicast_query_timer); + br_mdb_uninit(); spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); if (!mdb) diff --git a/trunk/net/bridge/br_netlink.c b/trunk/net/bridge/br_netlink.c index dead9dfe865b..97ba0189c6f7 100644 --- a/trunk/net/bridge/br_netlink.c +++ b/trunk/net/bridge/br_netlink.c @@ -305,5 +305,4 @@ int __init br_netlink_init(void) void __exit br_netlink_fini(void) { rtnl_link_unregister(&br_link_ops); - rtnl_unregister_all(PF_BRIDGE); } diff --git a/trunk/net/bridge/br_private.h b/trunk/net/bridge/br_private.h index f21a739a6186..8d83be5ffedc 100644 --- a/trunk/net/bridge/br_private.h +++ b/trunk/net/bridge/br_private.h @@ -83,6 +83,7 @@ struct net_bridge_port_group { struct rcu_head rcu; struct timer_list timer; struct br_ip addr; + unsigned char state; }; struct net_bridge_mdb_entry @@ -443,8 +444,10 @@ extern void br_multicast_free_pg(struct rcu_head *head); extern struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, - struct net_bridge_port_group *next); + struct net_bridge_port_group *next, + unsigned char state); extern void br_mdb_init(void); +extern void br_mdb_uninit(void); extern void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type); diff --git a/trunk/net/core/net_namespace.c b/trunk/net/core/net_namespace.c index 6456439cbbd9..8acce01b6dab 100644 --- a/trunk/net/core/net_namespace.c +++ b/trunk/net/core/net_namespace.c @@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ + return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ + proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { + .init = net_ns_net_init, + .exit = net_ns_net_exit, +}; + static int __init net_ns_init(void) { struct net_generic *ng; @@ -412,6 +427,8 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); + register_pernet_subsys(&net_ns_ops); + return 0; } @@ -630,16 +647,29 @@ static void netns_put(void *ns) static int netns_install(struct nsproxy *nsproxy, void *ns) { + struct net *net = ns; + + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + put_net(nsproxy->net_ns); - nsproxy->net_ns = get_net(ns); + nsproxy->net_ns = get_net(net); return 0; } +static unsigned int netns_inum(void *ns) +{ + struct net *net = ns; + return net->proc_inum; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .inum = netns_inum, }; #endif diff --git a/trunk/net/dccp/ipv4.c b/trunk/net/dccp/ipv4.c index 176ecdba4a22..4f9f5eb478f1 100644 --- a/trunk/net/dccp/ipv4.c +++ b/trunk/net/dccp/ipv4.c @@ -439,8 +439,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); goto exit; } diff --git a/trunk/net/dccp/ipv6.c b/trunk/net/dccp/ipv6.c index 56840b249f3b..6e05981f271e 100644 --- a/trunk/net/dccp/ipv6.c +++ b/trunk/net/dccp/ipv6.c @@ 
-585,7 +585,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newinet->inet_rcv_saddr = LOOPBACK4_IPV6; if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); goto out; } __inet6_hash(newsk, NULL); diff --git a/trunk/net/ipv4/inet_connection_sock.c b/trunk/net/ipv4/inet_connection_sock.c index 2026542d6836..d0670f00d524 100644 --- a/trunk/net/ipv4/inet_connection_sock.c +++ b/trunk/net/ipv4/inet_connection_sock.c @@ -710,6 +710,22 @@ void inet_csk_destroy_sock(struct sock *sk) } EXPORT_SYMBOL(inet_csk_destroy_sock); +/* This function allows to force a closure of a socket after the call to + * tcp/dccp_create_openreq_child(). + */ +void inet_csk_prepare_forced_close(struct sock *sk) +{ + /* sk_clone_lock locked the socket and set refcnt to 2 */ + bh_unlock_sock(sk); + sock_put(sk); + + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} +EXPORT_SYMBOL(inet_csk_prepare_forced_close); + int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); diff --git a/trunk/net/ipv4/tcp_ipv4.c b/trunk/net/ipv4/tcp_ipv4.c index 1ed230716d51..54139fa514e6 100644 --- a/trunk/net/ipv4/tcp_ipv4.c +++ b/trunk/net/ipv4/tcp_ipv4.c @@ -1767,10 +1767,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - tcp_clear_xmit_timers(newsk); - tcp_cleanup_congestion_control(newsk); - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/trunk/net/ipv6/Makefile b/trunk/net/ipv6/Makefile index 2068ac4fbdad..4ea244891b58 100644 --- a/trunk/net/ipv6/Makefile +++ b/trunk/net/ipv6/Makefile @@ -41,6 +41,6 @@ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o obj-y += addrconf_core.o exthdrs_core.o -obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6_offload) +obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/trunk/net/ipv6/addrconf.c b/trunk/net/ipv6/addrconf.c index 6fca01f136ad..408cac4ae00a 100644 --- a/trunk/net/ipv6/addrconf.c +++ b/trunk/net/ipv6/addrconf.c @@ -534,8 +534,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); } static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { diff --git a/trunk/net/ipv6/ndisc.c b/trunk/net/ipv6/ndisc.c index f2a007b7bde3..6574175795df 100644 --- a/trunk/net/ipv6/ndisc.c +++ b/trunk/net/ipv6/ndisc.c @@ -1314,6 +1314,12 @@ static void ndisc_router_discovery(struct sk_buff *skb) static void ndisc_redirect_rcv(struct sk_buff *skb) { + u8 *hdr; + struct ndisc_options ndopts; + struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct rd_msg, opt)); + #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: @@ -1330,6 +1336,17 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) + return; + + if 
(!ndopts.nd_opts_rh) + return; + + hdr = (u8 *)ndopts.nd_opts_rh; + hdr += 8; + if (!pskb_pull(skb, hdr - skb_transport_header(skb))) + return; + icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } diff --git a/trunk/net/ipv6/tcp_ipv6.c b/trunk/net/ipv6/tcp_ipv6.c index 6565cf55eb1e..93825dd3a7c0 100644 --- a/trunk/net/ipv6/tcp_ipv6.c +++ b/trunk/net/ipv6/tcp_ipv6.c @@ -1288,7 +1288,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #endif if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto out; } __inet6_hash(newsk, NULL); diff --git a/trunk/net/mac802154/ieee802154_dev.c b/trunk/net/mac802154/ieee802154_dev.c index e748aed290aa..b7c7f815deae 100644 --- a/trunk/net/mac802154/ieee802154_dev.c +++ b/trunk/net/mac802154/ieee802154_dev.c @@ -224,9 +224,9 @@ void ieee802154_free_device(struct ieee802154_dev *hw) BUG_ON(!list_empty(&priv->slaves)); - wpan_phy_free(priv->phy); - mutex_destroy(&priv->slaves_mtx); + + wpan_phy_free(priv->phy); } EXPORT_SYMBOL(ieee802154_free_device); diff --git a/trunk/net/netlink/af_netlink.c b/trunk/net/netlink/af_netlink.c index c8a1eb6eca2d..c0353d55d56f 100644 --- a/trunk/net/netlink/af_netlink.c +++ b/trunk/net/netlink/af_netlink.c @@ -669,6 +669,9 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + if (addr_len < sizeof(struct sockaddr_nl)) + return -EINVAL; + if (nladdr->nl_family != AF_NETLINK) return -EINVAL; @@ -2059,7 +2062,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v) struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%pK %-3d %-6d %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", + seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", s, s->sk_protocol, nlk->portid, diff --git a/trunk/net/sctp/Kconfig b/trunk/net/sctp/Kconfig index a9edd2e205f4..c26210618e14 100644 --- a/trunk/net/sctp/Kconfig +++ b/trunk/net/sctp/Kconfig @@ -66,12 +66,36 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N +choice + prompt "Default SCTP cookie HMAC encoding" + default SCTP_COOKIE_HMAC_MD5 + help + This option sets the default sctp cookie hmac algorithm + when in doubt select 'md5' + +config SCTP_DEFAULT_COOKIE_HMAC_MD5 + bool "Enable optional MD5 hmac cookie generation" + help + Enable optional MD5 hmac based SCTP cookie generation + select SCTP_COOKIE_HMAC_MD5 + +config SCTP_DEFAULT_COOKIE_HMAC_SHA1 + bool "Enable optional SHA1 hmac cookie generation" + help + Enable optional SHA1 hmac based SCTP cookie generation + select SCTP_COOKIE_HMAC_SHA1 + +config SCTP_DEFAULT_COOKIE_HMAC_NONE + bool "Use no hmac alg in SCTP cookie generation" + help + Use no hmac algorithm in SCTP cookie generation + +endchoice config SCTP_COOKIE_HMAC_MD5 bool "Enable optional MD5 hmac cookie generation" help Enable optional MD5 hmac based SCTP cookie generation - default y select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 @@ -79,7 +103,6 @@ config SCTP_COOKIE_HMAC_SHA1 bool "Enable optional SHA1 hmac cookie generation" help Enable optional SHA1 hmac based SCTP cookie generation - default y select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 diff --git a/trunk/net/sctp/probe.c b/trunk/net/sctp/probe.c index bc6cd75cc1dc..5f7518de2fd1 100644 --- a/trunk/net/sctp/probe.c +++ b/trunk/net/sctp/probe.c @@ -122,7 +122,8 @@ static const struct file_operations 
sctpprobe_fops = { .llseek = noop_llseek, }; -sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep, +sctp_disposition_t jsctp_sf_eat_sack(struct net *net, + const struct sctp_endpoint *ep, const struct sctp_association *asoc, const sctp_subtype_t type, void *arg, diff --git a/trunk/net/sctp/protocol.c b/trunk/net/sctp/protocol.c index 2c7785bacf74..f898b1c58bd2 100644 --- a/trunk/net/sctp/protocol.c +++ b/trunk/net/sctp/protocol.c @@ -1191,9 +1191,9 @@ static int __net_init sctp_net_init(struct net *net) net->sctp.cookie_preserve_enable = 1; /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_CRYPTO_MD5) +#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_CRYPTO_SHA1) +#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) net->sctp.sctp_hmac_alg = "sha1"; #else net->sctp.sctp_hmac_alg = NULL; diff --git a/trunk/net/sunrpc/auth_gss/auth_gss.c b/trunk/net/sunrpc/auth_gss/auth_gss.c index 909dc0c31aab..6e5c824b040b 100644 --- a/trunk/net/sunrpc/auth_gss/auth_gss.c +++ b/trunk/net/sunrpc/auth_gss/auth_gss.c @@ -192,17 +192,23 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct const void *q; unsigned int seclen; unsigned int timeout; + unsigned long now = jiffies; u32 window_size; int ret; - /* First unsigned int gives the lifetime (in seconds) of the cred */ + /* First unsigned int gives the remaining lifetime in seconds of the + * credential - e.g. the remaining TGT lifetime for Kerberos or + * the -t value passed to GSSD. + */ p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); if (IS_ERR(p)) goto err; if (timeout == 0) timeout = GSSD_MIN_TIMEOUT; - ctx->gc_expiry = jiffies + (unsigned long)timeout * HZ * 3 / 4; - /* Sequence number window. Determines the maximum number of simultaneous requests */ + ctx->gc_expiry = now + ((unsigned long)timeout * HZ); + /* Sequence number window. Determines the maximum number of + * simultaneous requests + */ p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); if (IS_ERR(p)) goto err; @@ -237,9 +243,12 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct p = ERR_PTR(ret); goto err; } + dprintk("RPC: %s Success. 
gc_expiry %lu now %lu timeout %u\n", + __func__, ctx->gc_expiry, now, timeout); return q; err: - dprintk("RPC: %s returning %ld\n", __func__, -PTR_ERR(p)); + dprintk("RPC: %s returns %ld gc_expiry %lu now %lu timeout %u\n", + __func__, -PTR_ERR(p), ctx->gc_expiry, now, timeout); return p; } diff --git a/trunk/net/sunrpc/backchannel_rqst.c b/trunk/net/sunrpc/backchannel_rqst.c index a9c0bbccad6b..890a29912d5a 100644 --- a/trunk/net/sunrpc/backchannel_rqst.c +++ b/trunk/net/sunrpc/backchannel_rqst.c @@ -59,7 +59,7 @@ static void xprt_free_allocation(struct rpc_rqst *req) struct xdr_buf *xbufp; dprintk("RPC: free allocations for req= %p\n", req); - BUG_ON(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); + WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); xbufp = &req->rq_private_buf; free_page((unsigned long)xbufp->head[0].iov_base); xbufp = &req->rq_snd_buf; @@ -191,7 +191,9 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) dprintk("RPC: destroy backchannel transport\n"); - BUG_ON(max_reqs == 0); + if (max_reqs == 0) + goto out; + spin_lock_bh(&xprt->bc_pa_lock); xprt_dec_alloc_count(xprt, max_reqs); list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { @@ -202,6 +204,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) } spin_unlock_bh(&xprt->bc_pa_lock); +out: dprintk("RPC: backchannel list empty= %s\n", list_empty(&xprt->bc_pa_list) ? "true" : "false"); } @@ -255,7 +258,7 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); smp_mb__before_clear_bit(); - BUG_ON(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); + WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); smp_mb__after_clear_bit(); diff --git a/trunk/net/sunrpc/bc_svc.c b/trunk/net/sunrpc/bc_svc.c index 0b2eb388cbda..15c7a8a1c24f 100644 --- a/trunk/net/sunrpc/bc_svc.c +++ b/trunk/net/sunrpc/bc_svc.c @@ -53,7 +53,7 @@ int bc_send(struct rpc_rqst *req) if (IS_ERR(task)) ret = PTR_ERR(task); else { - BUG_ON(atomic_read(&task->tk_count) != 1); + WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); ret = task->tk_status; rpc_put_task(task); } diff --git a/trunk/net/sunrpc/cache.c b/trunk/net/sunrpc/cache.c index fc2f7aa4dca7..9afa4393c217 100644 --- a/trunk/net/sunrpc/cache.c +++ b/trunk/net/sunrpc/cache.c @@ -775,11 +775,11 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); mutex_unlock(&inode->i_mutex); - BUG_ON(rp->offset); + WARN_ON_ONCE(rp->offset); return 0; } rq = container_of(rp->q.list.next, struct cache_request, q.list); - BUG_ON(rq->q.reader); + WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; spin_unlock(&queue_lock); diff --git a/trunk/net/sunrpc/clnt.c b/trunk/net/sunrpc/clnt.c index cdc7564b4512..822f020fa7f4 100644 --- a/trunk/net/sunrpc/clnt.c +++ b/trunk/net/sunrpc/clnt.c @@ -132,8 +132,10 @@ static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb, int error; dir = rpc_d_lookup_sb(sb, dir_name); - if (dir == NULL) + if (dir == NULL) { + pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); return dir; + } for (;;) { q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; @@ -192,7 +194,8 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, case RPC_PIPEFS_MOUNT: dentry = rpc_setup_pipedir_sb(sb, clnt, 
clnt->cl_program->pipe_dir_name); - BUG_ON(dentry == NULL); + if (!dentry) + return -ENOENT; if (IS_ERR(dentry)) return PTR_ERR(dentry); clnt->cl_dentry = dentry; @@ -234,7 +237,7 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { if (clnt->cl_program->pipe_dir_name == NULL) - break; + continue; if (rpc_clnt_skip_event(clnt, event)) continue; if (atomic_inc_not_zero(&clnt->cl_count) == 0) @@ -607,6 +610,13 @@ EXPORT_SYMBOL_GPL(rpc_killall_tasks); */ void rpc_shutdown_client(struct rpc_clnt *clnt) { + /* + * To avoid deadlock, never call rpc_shutdown_client from a + * workqueue context! + */ + WARN_ON_ONCE(current->flags & PF_WQ_WORKER); + might_sleep(); + dprintk_rcu("RPC: shutting down %s client for %s\n", clnt->cl_protname, rcu_dereference(clnt->cl_xprt)->servername); @@ -693,21 +703,19 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, const struct rpc_program *program, u32 vers) { + struct rpc_create_args args = { + .program = program, + .prognumber = program->number, + .version = vers, + .authflavor = old->cl_auth->au_flavor, + .client_name = old->cl_principal, + }; struct rpc_clnt *clnt; - const struct rpc_version *version; int err; - BUG_ON(vers >= program->nrvers || !program->version[vers]); - version = program->version[vers]; - clnt = rpc_clone_client(old); + clnt = __rpc_clone_client(&args, old); if (IS_ERR(clnt)) goto out; - clnt->cl_procinfo = version->procs; - clnt->cl_maxproc = version->nrprocs; - clnt->cl_protname = program->name; - clnt->cl_prog = program->number; - clnt->cl_vers = version->number; - clnt->cl_stats = program->stats; err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); @@ -832,7 +840,12 @@ int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flag }; int status; - BUG_ON(flags & RPC_TASK_ASYNC); + WARN_ON_ONCE(flags & RPC_TASK_ASYNC); + if (flags & RPC_TASK_ASYNC) { + rpc_release_calldata(task_setup_data.callback_ops, + task_setup_data.callback_data); + return -EINVAL; + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -908,7 +921,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, task->tk_action = call_bc_transmit; atomic_inc(&task->tk_count); - BUG_ON(atomic_read(&task->tk_count) != 2); + WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); out: @@ -1368,6 +1381,7 @@ call_refreshresult(struct rpc_task *task) return; case -ETIMEDOUT: rpc_delay(task, 3*HZ); + case -EKEYEXPIRED: case -EAGAIN: status = -EACCES; if (!task->tk_cred_retry) @@ -1654,7 +1668,6 @@ call_transmit(struct rpc_task *task) task->tk_action = call_transmit_status; /* Encode here so that rpcsec_gss can use correct sequence number. */ if (rpc_task_need_encode(task)) { - BUG_ON(task->tk_rqstp->rq_bytes_sent != 0); rpc_xdr_encode(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) { @@ -1738,7 +1751,6 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - BUG_ON(task->tk_status != 0); task->tk_status = xprt_prepare_transmit(task); if (task->tk_status == -EAGAIN) { /* @@ -1785,7 +1797,7 @@ call_bc_transmit(struct rpc_task *task) * We were unable to reply and will have to drop the * request. The server should reconnect and retransmit. 
*/ - BUG_ON(task->tk_status == -EAGAIN); + WARN_ON_ONCE(task->tk_status == -EAGAIN); printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); break; diff --git a/trunk/net/sunrpc/rpc_pipe.c b/trunk/net/sunrpc/rpc_pipe.c index 80f5dd23417d..fd10981ea792 100644 --- a/trunk/net/sunrpc/rpc_pipe.c +++ b/trunk/net/sunrpc/rpc_pipe.c @@ -1093,7 +1093,7 @@ void rpc_put_sb_net(const struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - BUG_ON(sn->pipefs_sb == NULL); + WARN_ON(sn->pipefs_sb == NULL); mutex_unlock(&sn->pipefs_sb_lock); } EXPORT_SYMBOL_GPL(rpc_put_sb_net); @@ -1152,14 +1152,19 @@ static void rpc_kill_sb(struct super_block *sb) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); + if (sn->pipefs_sb != sb) { + mutex_unlock(&sn->pipefs_sb_lock); + goto out; + } sn->pipefs_sb = NULL; mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", net, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); + put_net(net); +out: kill_litter_super(sb); } diff --git a/trunk/net/sunrpc/rpcb_clnt.c b/trunk/net/sunrpc/rpcb_clnt.c index a70acae496e4..411f332de0b3 100644 --- a/trunk/net/sunrpc/rpcb_clnt.c +++ b/trunk/net/sunrpc/rpcb_clnt.c @@ -884,7 +884,10 @@ static void encode_rpcb_string(struct xdr_stream *xdr, const char *string, u32 len; len = strlen(string); - BUG_ON(len > maxstrlen); + WARN_ON_ONCE(len > maxstrlen); + if (len > maxstrlen) + /* truncate and hope for the best */ + len = maxstrlen; p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, string, len); } diff --git a/trunk/net/sunrpc/sched.c b/trunk/net/sunrpc/sched.c index 6357fcb00c7e..d17a704aaf5f 100644 --- a/trunk/net/sunrpc/sched.c +++ b/trunk/net/sunrpc/sched.c @@ -98,6 +98,23 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } +static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) +{ + queue->priority = priority; +} + +static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) +{ + queue->owner = pid; + queue->nr = RPC_BATCH_COUNT; +} + +static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) +{ + rpc_set_waitqueue_priority(queue, queue->maxpriority); + rpc_set_waitqueue_owner(queue, 0); +} + /* * Add new request to a priority queue. 
*/ @@ -109,9 +126,11 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *t; INIT_LIST_HEAD(&task->u.tk_wait.links); - q = &queue->tasks[queue_priority]; if (unlikely(queue_priority > queue->maxpriority)) - q = &queue->tasks[queue->maxpriority]; + queue_priority = queue->maxpriority; + if (queue_priority > queue->priority) + rpc_set_waitqueue_priority(queue, queue_priority); + q = &queue->tasks[queue_priority]; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); @@ -133,7 +152,9 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { - BUG_ON (RPC_IS_QUEUED(task)); + WARN_ON_ONCE(RPC_IS_QUEUED(task)); + if (RPC_IS_QUEUED(task)) + return; if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); @@ -178,24 +199,6 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas task->tk_pid, queue, rpc_qname(queue)); } -static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) -{ - queue->priority = priority; - queue->count = 1 << (priority * 2); -} - -static inline void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) -{ - queue->owner = pid; - queue->nr = RPC_BATCH_COUNT; -} - -static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) -{ - rpc_set_waitqueue_priority(queue, queue->maxpriority); - rpc_set_waitqueue_owner(queue, 0); -} - static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues) { int i; @@ -334,7 +337,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, __rpc_add_wait_queue(q, task, queue_priority); - BUG_ON(task->tk_callback != NULL); + WARN_ON_ONCE(task->tk_callback != NULL); task->tk_callback = action; __rpc_add_timer(q, task); } @@ -343,7 +346,12 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action) { /* We shouldn't ever put an inactive task to sleep */ - BUG_ON(!RPC_IS_ACTIVATED(task)); + WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); + if (!RPC_IS_ACTIVATED(task)) { + task->tk_status = -EIO; + rpc_put_task_async(task); + return; + } /* * Protect the queue operations. @@ -358,7 +366,12 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, int priority) { /* We shouldn't ever put an inactive task to sleep */ - BUG_ON(!RPC_IS_ACTIVATED(task)); + WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); + if (!RPC_IS_ACTIVATED(task)) { + task->tk_status = -EIO; + rpc_put_task_async(task); + return; + } /* * Protect the queue operations. @@ -367,6 +380,7 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, __rpc_sleep_on_priority(q, task, action, priority - RPC_PRIORITY_LOW); spin_unlock_bh(&q->lock); } +EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** * __rpc_do_wake_up_task - wake up a single rpc_task @@ -451,8 +465,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q /* * Check if we need to switch queues. 
*/ - if (--queue->count) - goto new_owner; + goto new_owner; } /* @@ -697,7 +710,9 @@ static void __rpc_execute(struct rpc_task *task) dprintk("RPC: %5u __rpc_execute flags=0x%x\n", task->tk_pid, task->tk_flags); - BUG_ON(RPC_IS_QUEUED(task)); + WARN_ON_ONCE(RPC_IS_QUEUED(task)); + if (RPC_IS_QUEUED(task)) + return; for (;;) { void (*do_action)(struct rpc_task *); @@ -981,7 +996,7 @@ static void rpc_release_task(struct rpc_task *task) { dprintk("RPC: %5u release task\n", task->tk_pid); - BUG_ON (RPC_IS_QUEUED(task)); + WARN_ON_ONCE(RPC_IS_QUEUED(task)); rpc_release_resources_task(task); diff --git a/trunk/net/sunrpc/svc.c b/trunk/net/sunrpc/svc.c index 3ee7461926d8..dfa4ba69ff45 100644 --- a/trunk/net/sunrpc/svc.c +++ b/trunk/net/sunrpc/svc.c @@ -324,7 +324,9 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) * The caller checks for sv_nrpools > 1, which * implies that we've been initialized. */ - BUG_ON(m->count == 0); + WARN_ON_ONCE(m->count == 0); + if (m->count == 0) + return; switch (m->mode) { case SVC_POOL_PERCPU: @@ -585,7 +587,9 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) * We assume one is at most one page */ arghi = 0; - BUG_ON(pages > RPCSVC_MAXPAGES); + WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); + if (pages > RPCSVC_MAXPAGES) + pages = RPCSVC_MAXPAGES; while (pages) { struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); if (!p) @@ -946,7 +950,9 @@ int svc_register(const struct svc_serv *serv, struct net *net, unsigned int i; int error = 0; - BUG_ON(proto == 0 && port == 0); + WARN_ON_ONCE(proto == 0 && port == 0); + if (proto == 0 && port == 0) + return -EINVAL; for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { diff --git a/trunk/net/sunrpc/svc_xprt.c b/trunk/net/sunrpc/svc_xprt.c index 194d865fae72..b8e47fac7315 100644 --- a/trunk/net/sunrpc/svc_xprt.c +++ b/trunk/net/sunrpc/svc_xprt.c @@ -218,7 +218,9 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, */ static void svc_xprt_received(struct svc_xprt *xprt) { - BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + WARN_ON_ONCE(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) + return; /* As soon as we clear busy, the xprt could be closed and * 'put', so we need a reference to call svc_xprt_enqueue with: */ @@ -577,7 +579,10 @@ int svc_alloc_arg(struct svc_rqst *rqstp) /* now allocate needed pages. 
If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - BUG_ON(pages >= RPCSVC_MAXPAGES); + WARN_ON_ONCE(pages >= RPCSVC_MAXPAGES); + if (pages >= RPCSVC_MAXPAGES) + /* use as many pages as possible */ + pages = RPCSVC_MAXPAGES - 1; for (i = 0; i < pages ; i++) while (rqstp->rq_pages[i] == NULL) { struct page *p = alloc_page(GFP_KERNEL); @@ -926,7 +931,7 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) list_del_init(&xprt->xpt_list); - BUG_ON(!list_empty(&xprt->xpt_ready)); + WARN_ON_ONCE(!list_empty(&xprt->xpt_ready)); if (test_bit(XPT_TEMP, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); diff --git a/trunk/net/sunrpc/svcsock.c b/trunk/net/sunrpc/svcsock.c index 03827cef1fa7..cc3020d16789 100644 --- a/trunk/net/sunrpc/svcsock.c +++ b/trunk/net/sunrpc/svcsock.c @@ -84,7 +84,11 @@ static struct lock_class_key svc_slock_key[2]; static void svc_reclassify_socket(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); + + WARN_ON_ONCE(sock_owned_by_user(sk)); + if (sock_owned_by_user(sk)) + return; + switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", diff --git a/trunk/net/sunrpc/xdr.c b/trunk/net/sunrpc/xdr.c index 08f50afd5f2a..56055632f151 100644 --- a/trunk/net/sunrpc/xdr.c +++ b/trunk/net/sunrpc/xdr.c @@ -318,7 +318,10 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) tail = buf->tail; head = buf->head; - BUG_ON (len > head->iov_len); + + WARN_ON_ONCE(len > head->iov_len); + if (len > head->iov_len) + len = head->iov_len; /* Shift the tail first */ if (tail->iov_len != 0) { diff --git a/trunk/net/sunrpc/xprtsock.c b/trunk/net/sunrpc/xprtsock.c index 75853cabf4c9..68b0a81c31d5 100644 --- a/trunk/net/sunrpc/xprtsock.c +++ b/trunk/net/sunrpc/xprtsock.c @@ -1746,7 +1746,6 @@ static inline void xs_reclassify_socketu(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]); } @@ -1755,7 +1754,6 @@ static inline void xs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); } @@ -1764,13 +1762,16 @@ static inline void xs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); } static inline void xs_reclassify_socket(int family, struct socket *sock) { + WARN_ON_ONCE(sock_owned_by_user(sock->sk)); + if (sock_owned_by_user(sock->sk)) + return; + switch (family) { case AF_LOCAL: xs_reclassify_socketu(sock); @@ -1901,6 +1902,10 @@ static void xs_local_setup_socket(struct work_struct *work) dprintk("RPC: xprt %p: socket %s does not exist\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); break; + case -ECONNREFUSED: + dprintk("RPC: xprt %p: connection refused for %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + break; default: printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n", __func__, -status, @@ -2329,9 +2334,11 @@ static void *bc_malloc(struct rpc_task *task, size_t size) struct page *page; struct rpc_buffer *buf; - BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - page = alloc_page(GFP_KERNEL); + 
WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) + return NULL; + page = alloc_page(GFP_KERNEL); if (!page) return NULL; @@ -2393,7 +2400,6 @@ static int bc_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; - struct svc_sock *svsk; u32 len; dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); @@ -2401,7 +2407,6 @@ static int bc_send_request(struct rpc_task *task) * Get the server socket associated with this callback xprt */ xprt = req->rq_xprt->bc_xprt; - svsk = container_of(xprt, struct svc_sock, sk_xprt); /* * Grab the mutex to serialize data as the connection is shared diff --git a/trunk/scripts/Makefile.modsign b/trunk/scripts/Makefile.modsign new file mode 100644 index 000000000000..abfda626dbad --- /dev/null +++ b/trunk/scripts/Makefile.modsign @@ -0,0 +1,32 @@ +# ========================================================================== +# Signing modules +# ========================================================================== + +PHONY := __modsign +__modsign: + +include scripts/Kbuild.include + +__modules := $(sort $(shell grep -h '\.ko' /dev/null $(wildcard $(MODVERDIR)/*.mod))) +modules := $(patsubst %.o,%.ko,$(wildcard $(__modules:.ko=.o))) + +PHONY += $(modules) +__modsign: $(modules) + @: + +quiet_cmd_sign_ko = SIGN [M] $(2)/$(notdir $@) + cmd_sign_ko = $(mod_sign_cmd) $(2)/$(notdir $@) + +# Modules built outside the kernel source tree go into extra by default +INSTALL_MOD_DIR ?= extra +ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D)) + +modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D)) + +$(modules): + $(call cmd,sign_ko,$(MODLIB)/$(modinst_dir)) + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. + +.PHONY: $(PHONY) diff --git a/trunk/scripts/checkpatch.pl b/trunk/scripts/checkpatch.pl index f18750e3bd6c..1d6e4c541370 100755 --- a/trunk/scripts/checkpatch.pl +++ b/trunk/scripts/checkpatch.pl @@ -33,6 +33,7 @@ my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; +my $max_line_length = 80; sub help { my ($exitcode) = @_; @@ -51,6 +52,7 @@ sub help { -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests --ignore TYPE(,TYPE2...) ignore various comma separated message types + --max-line-length=n set the maximum line length, if exceeded, warn --show-types show the message "types" in the output --root=PATH PATH to the kernel tree root --no-summary suppress the per-file summary @@ -107,6 +109,7 @@ sub help { 'strict!' => \$check, 'ignore=s' => \@ignore, 'show-types!' => \$show_types, + 'max-line-length=i' => \$max_line_length, 'root=s' => \$root, 'summary!' => \$summary, 'mailback!' 
=> \$mailback, @@ -227,7 +230,11 @@ sub help { our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; -our $Constant = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)}; +our $Float_hex = qr{(?i:0x[0-9a-f]+p-?[0-9]+[fl]?)}; +our $Float_dec = qr{(?i:((?:[0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(?:e-?[0-9]+)?[fl]?))}; +our $Float_int = qr{(?i:[0-9]+e-?[0-9]+[fl]?)}; +our $Float = qr{$Float_hex|$Float_dec|$Float_int}; +our $Constant = qr{(?:$Float|(?i:(?:0x[0-9a-f]+|[0-9]+)[ul]*))}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ @@ -352,27 +359,6 @@ sub deparenthesize { $chk_signoff = 0 if ($file); -my @dep_includes = (); -my @dep_functions = (); -my $removal = "Documentation/feature-removal-schedule.txt"; -if ($tree && -f "$root/$removal") { - open(my $REMOVE, '<', "$root/$removal") || - die "$P: $removal: open failed - $!\n"; - while (<$REMOVE>) { - if (/^Check:\s+(.*\S)/) { - for my $entry (split(/[, ]+/, $1)) { - if ($entry =~ m@include/(.*)@) { - push(@dep_includes, $1); - - } elsif ($entry !~ m@/@) { - push(@dep_functions, $entry); - } - } - } - } - close($REMOVE); -} - my @rawlines = (); my @lines = (); my $vname; @@ -1412,6 +1398,8 @@ sub process { my %suppress_export; my $suppress_statement = 0; + my %camelcase = (); + # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. # @@ -1757,6 +1745,13 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } +# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig. + if ($realfile =~ /Kconfig/ && + $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) { + WARN("CONFIG_EXPERIMENTAL", + "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); + } + if (($realfile =~ /Makefile.*/ || $realfile =~ /Kbuild.*/) && ($line =~ /\+(EXTRA_[A-Z]+FLAGS).*/)) { my $flag = $1; @@ -1774,15 +1769,15 @@ sub process { # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/); -#80 column limit +#line length limit if ($line =~ /^\+/ && $prevrawline !~ /\/\*\*/ && $rawline !~ /^.\s*\*\s*\@$Ident\s/ && !($line =~ /^\+\s*$logFunctions\s*\(\s*(?:(KERN_\S+\s*|[^"]*))?"[X\t]*"\s*(?:|,|\)\s*;)\s*$/ || $line =~ /^\+\s*"[^"]*"\s*(?:\s*|,|\)\s*;)\s*$/) && - $length > 80) + $length > $max_line_length) { WARN("LONG_LINE", - "line over 80 characters\n" . $herecurr); + "line over $max_line_length characters\n" . $herecurr); } # Check for user-visible strings broken across lines, which breaks the ability @@ -1912,6 +1907,12 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# discourage the addition of CONFIG_EXPERIMENTAL in #if(def). + if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { + WARN("CONFIG_EXPERIMENTAL", + "Use of CONFIG_EXPERIMENTAL is deprecated. 
For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); + } + # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", @@ -2906,12 +2907,17 @@ sub process { } } -#studly caps, commented out until figure out how to distinguish between use of existing and adding new -# if (($line=~/[\w_][a-z\d]+[A-Z]/) and !($line=~/print/)) { -# print "No studly caps, use _\n"; -# print "$herecurr"; -# $clean = 0; -# } +#CamelCase + while ($line =~ m{($Constant|$Lval)}g) { + my $var = $1; + if ($var !~ /$Constant/ && + $var =~ /[A-Z]\w*[a-z]|[a-z]\w*[A-Z]/ && + !defined $camelcase{$var}) { + $camelcase{$var} = 1; + WARN("CAMELCASE", + "Avoid CamelCase: <$var>\n" . $herecurr); + } + } #no spaces allowed after \ in define if ($line=~/\#\s*define.*\\\s$/) { @@ -3013,6 +3019,17 @@ sub process { "Macros with complex values should be enclosed in parenthesis\n" . "$herectx"); } } + +# check for line continuations outside of #defines, preprocessor #, and asm + + } else { + if ($prevline !~ /^..*\\$/ && + $line !~ /^\+\s*\#.*\\$/ && # preprocessor + $line !~ /^\+.*\b(__asm__|asm)\b.*\\$/ && # asm + $line =~ /^\+.*\\$/) { + WARN("LINE_CONTINUATIONS", + "Avoid unnecessary line continuations\n" . $herecurr); + } } # do {} while (0) macro tests: @@ -3183,20 +3200,14 @@ sub process { } } -# don't include deprecated include files (uses RAW line) - for my $inc (@dep_includes) { - if ($rawline =~ m@^.\s*\#\s*include\s*\<$inc>@) { - ERROR("DEPRECATED_INCLUDE", - "Don't use <$inc>: see Documentation/feature-removal-schedule.txt\n" . $herecurr); - } +# check for unnecessary blank lines around braces + if (($line =~ /^..*}\s*$/ && $prevline =~ /^.\s*$/)) { + CHK("BRACES", + "Blank lines aren't necessary before a close brace '}'\n" . $hereprev); } - -# don't use deprecated functions - for my $func (@dep_functions) { - if ($line =~ /\b$func\b/) { - ERROR("DEPRECATED_FUNCTION", - "Don't use $func(): see Documentation/feature-removal-schedule.txt\n" . $herecurr); - } + if (($line =~ /^.\s*$/ && $prevline =~ /^..*{\s*$/)) { + CHK("BRACES", + "Blank lines aren't necessary after an open brace '{'\n" . $hereprev); } # no volatiles please @@ -3213,20 +3224,12 @@ sub process { $herecurr); } -# check for needless kfree() checks - if ($prevline =~ /\bif\s*\(([^\)]*)\)/) { - my $expr = $1; - if ($line =~ /\bkfree\(\Q$expr\E\);/) { - WARN("NEEDLESS_KFREE", - "kfree(NULL) is safe this check is probably not required\n" . $hereprev); - } - } -# check for needless usb_free_urb() checks - if ($prevline =~ /\bif\s*\(([^\)]*)\)/) { - my $expr = $1; - if ($line =~ /\busb_free_urb\(\Q$expr\E\);/) { - WARN("NEEDLESS_USB_FREE_URB", - "usb_free_urb(NULL) is safe this check is probably not required\n" . $hereprev); +# check for needless "if () fn()" uses + if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) { + my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;'; + if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) { + WARN('NEEDLESS_IF', + "$1(NULL) is safe this check is probably not required\n" . $hereprev); } } @@ -3344,6 +3347,12 @@ sub process { "Avoid line continuations in quoted strings\n" . $herecurr); } +# check for struct spinlock declarations + if ($line =~ /^.\s*\bstruct\s+spinlock\s+\w+\s*;/) { + WARN("USE_SPINLOCK_T", + "struct spinlock should be spinlock_t\n" . 
$herecurr); + } + # Check for misused memsets if ($^V && $^V ge 5.10.0 && defined $stat && @@ -3450,8 +3459,22 @@ sub process { # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { - WARN("ONE_SEMICOLON", - "Statements terminations use 1 semicolon\n" . $herecurr); + WARN("ONE_SEMICOLON", + "Statements terminations use 1 semicolon\n" . $herecurr); + } + +# check for switch/default statements without a break; + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { + my $ctx = ''; + my $herectx = $here . "\n"; + my $cnt = statement_rawlines($stat); + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; + } + WARN("DEFAULT_NO_BREAK", + "switch default: should use break\n" . $herectx); } # check for gcc specific __FUNCTION__ diff --git a/trunk/scripts/coccinelle/api/d_find_alias.cocci b/trunk/scripts/coccinelle/api/d_find_alias.cocci new file mode 100644 index 000000000000..a9694a8d3e5a --- /dev/null +++ b/trunk/scripts/coccinelle/api/d_find_alias.cocci @@ -0,0 +1,80 @@ +/// Make sure calls to d_find_alias() have a corresponding call to dput(). +// +// Keywords: d_find_alias, dput +// +// Confidence: Moderate +// URL: http://coccinelle.lip6.fr/ +// Options: -include_headers + +virtual context +virtual org +virtual patch +virtual report + +@r exists@ +local idexpression struct dentry *dent; +expression E, E1; +statement S1, S2; +position p1, p2; +@@ +( + if (!(dent@p1 = d_find_alias(...))) S1 +| + dent@p1 = d_find_alias(...) +) + +<...when != dput(dent) + when != if (...) { <+... dput(dent) ...+> } + when != true !dent || ... + when != dent = E + when != E = dent +if (!dent || ...) S2 +...> +( + return <+...dent...+>; +| + return @p2 ...; +| + dent@p2 = E1; +| + E1 = dent; +) + +@depends on context@ +local idexpression struct dentry *r.dent; +position r.p1,r.p2; +@@ +* dent@p1 = ... + ... +( +* return@p2 ...; +| +* dent@p2 +) + + +@script:python depends on org@ +p1 << r.p1; +p2 << r.p2; +@@ +cocci.print_main("Missing call to dput()",p1) +cocci.print_secs("",p2) + +@depends on patch@ +local idexpression struct dentry *r.dent; +position r.p2; +@@ +( ++ dput(dent); + return @p2 ...; +| ++ dput(dent); + dent@p2 = ...; +) + +@script:python depends on report@ +p1 << r.p1; +p2 << r.p2; +@@ +msg = "Missing call to dput() at line %s." 
+coccilib.report.print_report(p1[0], msg % (p2[0].line)) diff --git a/trunk/security/capability.c b/trunk/security/capability.c index b14a30c234b8..0fe5a026aef8 100644 --- a/trunk/security/capability.c +++ b/trunk/security/capability.c @@ -395,6 +395,11 @@ static int cap_kernel_module_request(char *kmod_name) return 0; } +static int cap_kernel_module_from_file(struct file *file) +{ + return 0; +} + static int cap_task_setpgid(struct task_struct *p, pid_t pgid) { return 0; @@ -967,6 +972,7 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, kernel_module_request); + set_to_cap_if_null(ops, kernel_module_from_file); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setpgid); set_to_cap_if_null(ops, task_getpgid); diff --git a/trunk/security/commoncap.c b/trunk/security/commoncap.c index 6dbae4650abe..7ee08c756d6b 100644 --- a/trunk/security/commoncap.c +++ b/trunk/security/commoncap.c @@ -76,24 +76,33 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, int audit) { - for (;;) { - /* The owner of the user namespace has all caps. */ - if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, cred->euid)) - return 0; + struct user_namespace *ns = targ_ns; + /* See if cred has the capability in the target user namespace + * by examining the target user namespace and all of the target + * user namespace's parents. + */ + for (;;) { /* Do we have the necessary capabilities? */ - if (targ_ns == cred->user_ns) + if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* Have we tried all of the parent namespaces? */ - if (targ_ns == &init_user_ns) + if (ns == &init_user_ns) return -EPERM; + /* + * The owner of the user namespace in the parent of the + * user namespace has all caps. + */ + if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) + return 0; + /* - *If you have a capability in a parent user ns, then you have + * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ - targ_ns = targ_ns->parent; + ns = ns->parent; } /* We never get here */ diff --git a/trunk/security/integrity/ima/ima.h b/trunk/security/integrity/ima/ima.h index 6ee8826662cc..3b2adb794f15 100644 --- a/trunk/security/integrity/ima/ima.h +++ b/trunk/security/integrity/ima/ima.h @@ -127,7 +127,7 @@ struct integrity_iint_cache *integrity_iint_insert(struct inode *inode); struct integrity_iint_cache *integrity_iint_find(struct inode *inode); /* IMA policy related functions */ -enum ima_hooks { FILE_CHECK = 1, FILE_MMAP, BPRM_CHECK, POST_SETATTR }; +enum ima_hooks { FILE_CHECK = 1, FILE_MMAP, BPRM_CHECK, MODULE_CHECK, POST_SETATTR }; int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask, int flags); diff --git a/trunk/security/integrity/ima/ima_api.c b/trunk/security/integrity/ima/ima_api.c index b356884fb3ef..0cea3db21657 100644 --- a/trunk/security/integrity/ima/ima_api.c +++ b/trunk/security/integrity/ima/ima_api.c @@ -100,12 +100,12 @@ void ima_add_violation(struct inode *inode, const unsigned char *filename, * ima_get_action - appraise & measure decision based on policy. 
* @inode: pointer to inode to measure * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXECUTE) - * @function: calling function (FILE_CHECK, BPRM_CHECK, FILE_MMAP) + * @function: calling function (FILE_CHECK, BPRM_CHECK, FILE_MMAP, MODULE_CHECK) * * The policy is defined in terms of keypairs: * subj=, obj=, type=, func=, mask=, fsmagic= * subj,obj, and type: are LSM specific. - * func: FILE_CHECK | BPRM_CHECK | FILE_MMAP + * func: FILE_CHECK | BPRM_CHECK | FILE_MMAP | MODULE_CHECK * mask: contains the permission mask * fsmagic: hex value * diff --git a/trunk/security/integrity/ima/ima_main.c b/trunk/security/integrity/ima/ima_main.c index 73c9a268253e..45de18e9a6f2 100644 --- a/trunk/security/integrity/ima/ima_main.c +++ b/trunk/security/integrity/ima/ima_main.c @@ -280,6 +280,27 @@ int ima_file_check(struct file *file, int mask) } EXPORT_SYMBOL_GPL(ima_file_check); +/** + * ima_module_check - based on policy, collect/store/appraise measurement. + * @file: pointer to the file to be measured/appraised + * + * Measure/appraise kernel modules based on policy. + * + * Always return 0 and audit dentry_open failures. + * Return code is based upon measurement appraisal. + */ +int ima_module_check(struct file *file) +{ + int rc; + + if (!file) + rc = INTEGRITY_UNKNOWN; + else + rc = process_measurement(file, file->f_dentry->d_name.name, + MAY_EXEC, MODULE_CHECK); + return (ima_appraise & IMA_APPRAISE_ENFORCE) ? rc : 0; +} + static int __init init_ima(void) { int error; diff --git a/trunk/security/integrity/ima/ima_policy.c b/trunk/security/integrity/ima/ima_policy.c index c7dacd2eab7a..af7d182d5a46 100644 --- a/trunk/security/integrity/ima/ima_policy.c +++ b/trunk/security/integrity/ima/ima_policy.c @@ -80,6 +80,7 @@ static struct ima_rule_entry default_rules[] = { .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE,.func = FILE_CHECK,.mask = MAY_READ,.uid = GLOBAL_ROOT_UID, .flags = IMA_FUNC | IMA_MASK | IMA_UID}, + {.action = MEASURE,.func = MODULE_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_appraise_rules[] = { @@ -401,6 +402,8 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) /* PATH_CHECK is for backwards compat */ else if (strcmp(args[0].from, "PATH_CHECK") == 0) entry->func = FILE_CHECK; + else if (strcmp(args[0].from, "MODULE_CHECK") == 0) + entry->func = MODULE_CHECK; else if (strcmp(args[0].from, "FILE_MMAP") == 0) entry->func = FILE_MMAP; else if (strcmp(args[0].from, "BPRM_CHECK") == 0) diff --git a/trunk/security/security.c b/trunk/security/security.c index 8dcd4ae10a5f..daa97f4ac9d1 100644 --- a/trunk/security/security.c +++ b/trunk/security/security.c @@ -820,6 +820,16 @@ int security_kernel_module_request(char *kmod_name) return security_ops->kernel_module_request(kmod_name); } +int security_kernel_module_from_file(struct file *file) +{ + int ret; + + ret = security_ops->kernel_module_from_file(file); + if (ret) + return ret; + return ima_module_check(file); +} + int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { diff --git a/trunk/security/selinux/nlmsgtab.c b/trunk/security/selinux/nlmsgtab.c index 370a6468b3ba..855e464e92ef 100644 --- a/trunk/security/selinux/nlmsgtab.c +++ b/trunk/security/selinux/nlmsgtab.c @@ -69,6 +69,8 @@ static struct nlmsg_perm nlmsg_route_perms[] = { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { 
RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETMDB, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; diff --git a/trunk/security/yama/yama_lsm.c b/trunk/security/yama/yama_lsm.c index 2663145d1197..23414b93771f 100644 --- a/trunk/security/yama/yama_lsm.c +++ b/trunk/security/yama/yama_lsm.c @@ -298,14 +298,18 @@ int yama_ptrace_access_check(struct task_struct *child, /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: + rcu_read_lock(); if (!task_is_descendant(current, child) && !ptracer_exception_found(current, child) && - !ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) + !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: - if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: @@ -343,8 +347,10 @@ int yama_ptrace_traceme(struct task_struct *parent) /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: - if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; diff --git a/trunk/sound/Kconfig b/trunk/sound/Kconfig index 261a03c8a209..c710ce2c5c37 100644 --- a/trunk/sound/Kconfig +++ b/trunk/sound/Kconfig @@ -52,9 +52,6 @@ config SOUND_OSS_CORE_PRECLAIM Disabling this allows alternative OSS implementations. - Please read Documentation/feature-removal-schedule.txt for - details. - If unsure, say Y. source "sound/oss/dmasound/Kconfig" diff --git a/trunk/sound/sound_core.c b/trunk/sound/sound_core.c index fb9255cca214..bb23009edc8d 100644 --- a/trunk/sound/sound_core.c +++ b/trunk/sound/sound_core.c @@ -146,8 +146,7 @@ extern int msnd_pinnacle_init(void); * devices only the standard chrdev aliases are requested. * * All these clutters are scheduled to be removed along with - * sound-slot/service-* module aliases. Please take a look at - * feature-removal-schedule.txt for details. + * sound-slot/service-* module aliases. 
*/ #ifdef CONFIG_SOUND_OSS_CORE_PRECLAIM static int preclaim_oss = 1; diff --git a/trunk/tools/power/x86/turbostat/Makefile b/trunk/tools/power/x86/turbostat/Makefile index f85649554191..f09641da40d4 100644 --- a/trunk/tools/power/x86/turbostat/Makefile +++ b/trunk/tools/power/x86/turbostat/Makefile @@ -1,9 +1,22 @@ +CC = $(CROSS_COMPILE)gcc +BUILD_OUTPUT := $(PWD) +PREFIX := /usr +DESTDIR := + turbostat : turbostat.c CFLAGS += -Wall +CFLAGS += -I../../../../arch/x86/include/uapi/ + +%: %.c + @mkdir -p $(BUILD_OUTPUT) + $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ +.PHONY : clean clean : - rm -f turbostat + @rm -f $(BUILD_OUTPUT)/turbostat -install : - install turbostat /usr/bin/turbostat - install turbostat.8 /usr/share/man/man8 +install : turbostat + install -d $(DESTDIR)$(PREFIX)/bin + install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat + install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 diff --git a/trunk/tools/power/x86/turbostat/turbostat.8 b/trunk/tools/power/x86/turbostat/turbostat.8 index e4d0690cccf9..0d7dc2cfefb5 100644 --- a/trunk/tools/power/x86/turbostat/turbostat.8 +++ b/trunk/tools/power/x86/turbostat/turbostat.8 @@ -11,16 +11,16 @@ turbostat \- Report processor frequency and idle statistics .RB [ Options ] .RB [ "\-i interval_sec" ] .SH DESCRIPTION -\fBturbostat \fP reports processor topology, frequency -and idle power state statistics on modern X86 processors. +\fBturbostat \fP reports processor topology, frequency, +idle power-state statistics, temperature and power on modern X86 processors. Either \fBcommand\fP is forked and statistics are printed upon its completion, or statistics are printed periodically. \fBturbostat \fP -requires that the processor +must be run on root, and +minimally requires that the processor supports an "invariant" TSC, plus the APERF and MPERF MSRs. -\fBturbostat \fP will report idle cpu power state residency -on processors that additionally support C-state residency counters. +Additional information is reported depending on hardware counter support. .SS Options The \fB-p\fP option limits output to the 1st thread in 1st core of each package. @@ -57,7 +57,15 @@ Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading T \fBGHz\fP average clock rate while the CPU was in c0 state. \fBTSC\fP average GHz that the TSC ran during the entire interval. \fB%c1, %c3, %c6, %c7\fP show the percentage residency in hardware core idle states. +\fBCTMP\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. +\fBPTMP\fP Degrees Celsius reported by the per-package Package Thermal Monitor. \fB%pc2, %pc3, %pc6, %pc7\fP percentage residency in hardware package idle states. +\fBPkg_W\fP Watts consumed by the whole package. +\fBCor_W\fP Watts consumed by the core part of the package. +\fBGFX_W\fP Watts consumed by the Graphics part of the package -- available only on client processors. +\fBRAM_W\fP Watts consumed by the DRAM DIMMS -- available only on server processors. +\fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. +\fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .fi .PP .SH EXAMPLE @@ -66,50 +74,73 @@ Without any parameters, turbostat prints out counters ever 5 seconds. for turbostat to fork). The first row of statistics is a summary for the entire system. -Note that the summary is a weighted average. +For residency % columns, the summary is a weighted average. 
+For Temperature columns, the summary is the column maximum. +For Watts columns, the summary is a system total. Subsequent rows show per-CPU statistics. .nf -[root@x980]# ./turbostat -cor CPU %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 - 0.09 1.62 3.38 1.83 0.32 97.76 1.26 83.61 - 0 0 0.15 1.62 3.38 10.23 0.05 89.56 1.26 83.61 - 0 6 0.05 1.62 3.38 10.34 - 1 2 0.03 1.62 3.38 0.07 0.05 99.86 - 1 8 0.03 1.62 3.38 0.06 - 2 4 0.21 1.62 3.38 0.10 1.49 98.21 - 2 10 0.02 1.62 3.38 0.29 - 8 1 0.04 1.62 3.38 0.04 0.08 99.84 - 8 7 0.01 1.62 3.38 0.06 - 9 3 0.53 1.62 3.38 0.10 0.20 99.17 - 9 9 0.02 1.62 3.38 0.60 - 10 5 0.01 1.62 3.38 0.02 0.04 99.92 - 10 11 0.02 1.62 3.38 0.02 +[root@sandy]# ./turbostat +cor CPU %c0 GHz TSC %c1 %c3 %c6 %c7 CTMP PTMP %pc2 %pc3 %pc6 %pc7 Pkg_W Cor_W GFX_W + 0.06 0.80 2.29 0.11 0.00 0.00 99.83 47 40 0.26 0.01 0.44 98.78 3.49 0.12 0.14 + 0 0 0.07 0.80 2.29 0.07 0.00 0.00 99.86 40 40 0.26 0.01 0.44 98.78 3.49 0.12 0.14 + 0 4 0.03 0.80 2.29 0.12 + 1 1 0.04 0.80 2.29 0.25 0.01 0.00 99.71 40 + 1 5 0.16 0.80 2.29 0.13 + 2 2 0.05 0.80 2.29 0.06 0.01 0.00 99.88 40 + 2 6 0.03 0.80 2.29 0.08 + 3 3 0.05 0.80 2.29 0.08 0.00 0.00 99.87 47 + 3 7 0.04 0.84 2.29 0.09 .fi .SH SUMMARY EXAMPLE The "-s" option prints the column headers just once, and then the one line system summary for each sample interval. .nf -[root@x980]# ./turbostat -s - %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 - 0.23 1.67 3.38 2.00 0.30 97.47 1.07 82.12 - 0.10 1.62 3.38 1.87 2.25 95.77 12.02 72.60 - 0.20 1.64 3.38 1.98 0.11 97.72 0.30 83.36 - 0.11 1.70 3.38 1.86 1.81 96.22 9.71 74.90 +[root@wsm]# turbostat -S + %c0 GHz TSC %c1 %c3 %c6 CTMP %pc3 %pc6 + 1.40 2.81 3.38 10.78 43.47 44.35 42 13.67 2.09 + 1.34 2.90 3.38 11.48 58.96 28.23 41 19.89 0.15 + 1.55 2.72 3.38 26.73 37.66 34.07 42 2.53 2.80 + 1.37 2.83 3.38 16.95 60.05 21.63 42 5.76 0.20 .fi .SH VERBOSE EXAMPLE The "-v" option adds verbosity to the output: .nf -GenuineIntel 11 CPUID levels; family:model:stepping 0x6:2c:2 (6:44:2) -12 * 133 = 1600 MHz max efficiency -25 * 133 = 3333 MHz TSC frequency -26 * 133 = 3467 MHz max turbo 4 active cores -26 * 133 = 3467 MHz max turbo 3 active cores -27 * 133 = 3600 MHz max turbo 2 active cores -27 * 133 = 3600 MHz max turbo 1 active cores - +[root@ivy]# turbostat -v +turbostat v3.0 November 23, 2012 - Len Brown +CPUID(0): GenuineIntel 13 CPUID levels; family:model:stepping 0x6:3a:9 (6:58:9) +CPUID(6): APERF, DTS, PTM, EPB +RAPL: 851 sec. Joule Counter Range +cpu0: MSR_NHM_PLATFORM_INFO: 0x81010f0012300 +16 * 100 = 1600 MHz max efficiency +35 * 100 = 3500 MHz TSC frequency +cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x1e008402 (UNdemote-C3, UNdemote-C1, demote-C3, demote-C1, locked: pkg-cstate-limit=2: pc6-noret) +cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x25262727 +37 * 100 = 3700 MHz max turbo 4 active cores +38 * 100 = 3800 MHz max turbo 3 active cores +39 * 100 = 3900 MHz max turbo 2 active cores +39 * 100 = 3900 MHz max turbo 1 active cores +cpu0: MSR_IA32_ENERGY_PERF_BIAS: 0x00000006 (balanced) +cpu0: MSR_RAPL_POWER_UNIT: 0x000a1003 (0.125000 Watts, 0.000015 Joules, 0.000977 sec.) +cpu0: MSR_PKG_POWER_INFO: 0x01e00268 (77 W TDP, RAPL 60 - 0 W, 0.000000 sec.) 
+cpu0: MSR_PKG_POWER_LIMIT: 0x830000148268 (UNlocked) +cpu0: PKG Limit #1: ENabled (77.000000 Watts, 1.000000 sec, clamp DISabled) +cpu0: PKG Limit #2: ENabled (96.000000 Watts, 0.000977* sec, clamp DISabled) +cpu0: MSR_PP0_POLICY: 0 +cpu0: MSR_PP0_POWER_LIMIT: 0x00000000 (UNlocked) +cpu0: Cores Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) +cpu0: MSR_PP1_POLICY: 0 +cpu0: MSR_PP1_POWER_LIMIT: 0x00000000 (UNlocked) +cpu0: GFX Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) +cpu0: MSR_IA32_TEMPERATURE_TARGET: 0x00691400 (105 C) +cpu0: MSR_IA32_PACKAGE_THERM_STATUS: 0x884e0000 (27 C) +cpu0: MSR_IA32_THERM_STATUS: 0x88560000 (19 C +/- 1) +cpu1: MSR_IA32_THERM_STATUS: 0x88560000 (19 C +/- 1) +cpu2: MSR_IA32_THERM_STATUS: 0x88540000 (21 C +/- 1) +cpu3: MSR_IA32_THERM_STATUS: 0x884e0000 (27 C +/- 1) + ... .fi The \fBmax efficiency\fP frequency, a.k.a. Low Frequency Mode, is the frequency available at the minimum package voltage. The \fBTSC frequency\fP is the nominal @@ -142,7 +173,7 @@ cor CPU %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 10 5 1.42 3.43 3.38 2.14 30.99 65.44 10 11 0.16 2.88 3.38 3.40 .fi -Above the cycle soaker drives cpu7 up its 3.6 Ghz turbo limit +Above the cycle soaker drives cpu7 up its 3.6 GHz turbo limit while the other processors are generally in various states of idle. Note that cpu1 and cpu7 are HT siblings within core8. diff --git a/trunk/tools/power/x86/turbostat/turbostat.c b/trunk/tools/power/x86/turbostat/turbostat.c index ea095abbe97e..ce6d46038f74 100644 --- a/trunk/tools/power/x86/turbostat/turbostat.c +++ b/trunk/tools/power/x86/turbostat/turbostat.c @@ -20,6 +20,7 @@ */ #define _GNU_SOURCE +#include #include #include #include @@ -35,28 +36,18 @@ #include #include -#define MSR_NEHALEM_PLATFORM_INFO 0xCE -#define MSR_NEHALEM_TURBO_RATIO_LIMIT 0x1AD -#define MSR_IVT_TURBO_RATIO_LIMIT 0x1AE -#define MSR_APERF 0xE8 -#define MSR_MPERF 0xE7 -#define MSR_PKG_C2_RESIDENCY 0x60D /* SNB only */ -#define MSR_PKG_C3_RESIDENCY 0x3F8 -#define MSR_PKG_C6_RESIDENCY 0x3F9 -#define MSR_PKG_C7_RESIDENCY 0x3FA /* SNB only */ -#define MSR_CORE_C3_RESIDENCY 0x3FC -#define MSR_CORE_C6_RESIDENCY 0x3FD -#define MSR_CORE_C7_RESIDENCY 0x3FE /* SNB only */ - char *proc_stat = "/proc/stat"; unsigned int interval_sec = 5; /* set with -i interval_sec */ unsigned int verbose; /* set with -v */ +unsigned int rapl_verbose; /* set with -R */ +unsigned int thermal_verbose; /* set with -T */ unsigned int summary_only; /* set with -s */ unsigned int skip_c0; unsigned int skip_c1; unsigned int do_nhm_cstates; unsigned int do_snb_cstates; unsigned int has_aperf; +unsigned int has_epb; unsigned int units = 1000000000; /* Ghz etc */ unsigned int genuine_intel; unsigned int has_invariant_tsc; @@ -74,6 +65,23 @@ unsigned int show_cpu; unsigned int show_pkg_only; unsigned int show_core_only; char *output_buffer, *outp; +unsigned int do_rapl; +unsigned int do_dts; +unsigned int do_ptm; +unsigned int tcc_activation_temp; +unsigned int tcc_activation_temp_override; +double rapl_power_units, rapl_energy_units, rapl_time_units; +double rapl_joule_counter_range; + +#define RAPL_PKG (1 << 0) +#define RAPL_CORES (1 << 1) +#define RAPL_GFX (1 << 2) +#define RAPL_DRAM (1 << 3) +#define RAPL_PKG_PERF_STATUS (1 << 4) +#define RAPL_DRAM_PERF_STATUS (1 << 5) +#define TJMAX_DEFAULT 100 + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) int aperf_mperf_unstable; int backwards_count; @@ -101,6 +109,7 @@ struct core_data { unsigned long long c3; unsigned long long c6; unsigned long long c7; + unsigned int core_temp_c; unsigned int core_id; } *core_even, *core_odd; @@ -110,6 +119,14 @@ struct pkg_data { unsigned long long pc6; unsigned long long pc7; unsigned int package_id; + unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + unsigned int energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + unsigned int energy_cores; /* MSR_PP0_ENERGY_STATUS */ + unsigned int energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + unsigned int rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + unsigned int rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + unsigned int pkg_temp_c; + } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -247,6 +264,12 @@ void print_header(void) outp += sprintf(outp, " %%c6"); if (do_snb_cstates) outp += sprintf(outp, " %%c7"); + + if (do_dts) + outp += sprintf(outp, " CTMP"); + if (do_ptm) + outp += sprintf(outp, " PTMP"); + if (do_snb_cstates) outp += sprintf(outp, " %%pc2"); if (do_nhm_cstates) @@ -256,6 +279,19 @@ void print_header(void) if (do_snb_cstates) outp += sprintf(outp, " %%pc7"); + if (do_rapl & RAPL_PKG) + outp += sprintf(outp, " Pkg_W"); + if (do_rapl & RAPL_CORES) + outp += sprintf(outp, " Cor_W"); + if (do_rapl & RAPL_GFX) + outp += sprintf(outp, " GFX_W"); + if (do_rapl & RAPL_DRAM) + outp += sprintf(outp, " RAM_W"); + if (do_rapl & RAPL_PKG_PERF_STATUS) + outp += sprintf(outp, " PKG_%%"); + if (do_rapl & RAPL_DRAM_PERF_STATUS) + outp += sprintf(outp, " RAM_%%"); + outp += sprintf(outp, "\n"); } @@ -285,6 +321,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, fprintf(stderr, "c3: %016llX\n", c->c3); fprintf(stderr, "c6: %016llX\n", c->c6); fprintf(stderr, "c7: %016llX\n", c->c7); + fprintf(stderr, "DTS: %dC\n", c->core_temp_c); } if (p) { @@ -293,6 +330,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, fprintf(stderr, "pc3: %016llX\n", p->pc3); fprintf(stderr, "pc6: %016llX\n", p->pc6); fprintf(stderr, "pc7: %016llX\n", p->pc7); + fprintf(stderr, "Joules PKG: %0X\n", p->energy_pkg); + fprintf(stderr, "Joules COR: %0X\n", p->energy_cores); + fprintf(stderr, "Joules GFX: %0X\n", p->energy_gfx); + fprintf(stderr, "Joules RAM: %0X\n", p->energy_dram); + fprintf(stderr, "Throttle PKG: %0X\n", p->rapl_pkg_perf_status); + fprintf(stderr, "Throttle RAM: %0X\n", p->rapl_dram_perf_status); + fprintf(stderr, "PTM: %dC\n", p->pkg_temp_c); } return 0; } @@ -302,14 +346,21 @@ int dump_counters(struct thread_data *t, struct core_data *c, * package: "pk" 2 columns %2d * core: "cor" 3 columns %3d * CPU: "CPU" 3 columns %3d + * Pkg_W: %6.2 + * Cor_W: %6.2 + * GFX_W: %5.2 + * RAM_W: %5.2 * GHz: "GHz" 3 columns %3.2 * TSC: "TSC" 3 columns %3.2 * percentage " %pc3" %6.2 + * Perf Status percentage: %5.2 + * "CTMP" 4 columns %4d */ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { double interval_float; + char *fmt5, *fmt6; /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) @@ -349,7 +400,6 @@ int format_counters(struct thread_data *t, struct core_data *c, if (show_cpu) outp += sprintf(outp, " %3d", t->cpu_id); } - /* %c0 */ if (do_nhm_cstates) { if (show_pkg || show_core || show_cpu) @@ -414,10 +464,16 @@ int format_counters(struct thread_data *t, struct core_data *c, if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * 
c->c7/t->tsc); + if (do_dts) + outp += sprintf(outp, " %4d", c->core_temp_c); + /* print per-package data only for 1st core in package */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) goto done; + if (do_ptm) + outp += sprintf(outp, " %4d", p->pkg_temp_c); + if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc2/t->tsc); if (do_nhm_cstates) @@ -426,6 +482,32 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, " %6.2f", 100.0 * p->pc6/t->tsc); if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc7/t->tsc); + + /* + * If measurement interval exceeds minimum RAPL Joule Counter range, + * indicate that results are suspect by printing "**" in fraction place. + */ + if (interval_float < rapl_joule_counter_range) { + fmt5 = " %5.2f"; + fmt6 = " %6.2f"; + } else { + fmt5 = " %3.0f**"; + fmt6 = " %4.0f**"; + } + + if (do_rapl & RAPL_PKG) + outp += sprintf(outp, fmt6, p->energy_pkg * rapl_energy_units / interval_float); + if (do_rapl & RAPL_CORES) + outp += sprintf(outp, fmt6, p->energy_cores * rapl_energy_units / interval_float); + if (do_rapl & RAPL_GFX) + outp += sprintf(outp, fmt5, p->energy_gfx * rapl_energy_units / interval_float); + if (do_rapl & RAPL_DRAM) + outp += sprintf(outp, fmt5, p->energy_dram * rapl_energy_units / interval_float); + if (do_rapl & RAPL_PKG_PERF_STATUS ) + outp += sprintf(outp, fmt5, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + if (do_rapl & RAPL_DRAM_PERF_STATUS ) + outp += sprintf(outp, fmt5, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + done: outp += sprintf(outp, "\n"); @@ -435,6 +517,7 @@ int format_counters(struct thread_data *t, struct core_data *c, void flush_stdout() { fputs(output_buffer, stdout); + fflush(stdout); outp = output_buffer; } void flush_stderr() @@ -461,6 +544,13 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_ for_all_cpus(format_counters, t, c, p); } +#define DELTA_WRAP32(new, old) \ + if (new > old) { \ + old = new - old; \ + } else { \ + old = 0x100000000 + new - old; \ + } + void delta_package(struct pkg_data *new, struct pkg_data *old) { @@ -468,6 +558,14 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pc3 = new->pc3 - old->pc3; old->pc6 = new->pc6 - old->pc6; old->pc7 = new->pc7 - old->pc7; + old->pkg_temp_c = new->pkg_temp_c; + + DELTA_WRAP32(new->energy_pkg, old->energy_pkg); + DELTA_WRAP32(new->energy_cores, old->energy_cores); + DELTA_WRAP32(new->energy_gfx, old->energy_gfx); + DELTA_WRAP32(new->energy_dram, old->energy_dram); + DELTA_WRAP32(new->rapl_pkg_perf_status, old->rapl_pkg_perf_status); + DELTA_WRAP32(new->rapl_dram_perf_status, old->rapl_dram_perf_status); } void @@ -476,6 +574,7 @@ delta_core(struct core_data *new, struct core_data *old) old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; old->c7 = new->c7 - old->c7; + old->core_temp_c = new->core_temp_c; } /* @@ -582,11 +681,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c3 = 0; c->c6 = 0; c->c7 = 0; + c->core_temp_c = 0; p->pc2 = 0; p->pc3 = 0; p->pc6 = 0; p->pc7 = 0; + + p->energy_pkg = 0; + p->energy_dram = 0; + p->energy_cores = 0; + p->energy_gfx = 0; + p->rapl_pkg_perf_status = 0; + p->rapl_dram_perf_status = 0; + p->pkg_temp_c = 0; } int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -607,6 +715,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.cores.c6 += c->c6; average.cores.c7 += c->c7; + 
average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); + /* sum per-pkg values only for 1st core in pkg */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; @@ -616,6 +726,15 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.packages.pc6 += p->pc6; average.packages.pc7 += p->pc7; + average.packages.energy_pkg += p->energy_pkg; + average.packages.energy_dram += p->energy_dram; + average.packages.energy_cores += p->energy_cores; + average.packages.energy_gfx += p->energy_gfx; + + average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); + + average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; + average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; return 0; } /* @@ -667,23 +786,26 @@ static unsigned long long rdtsc(void) int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; + unsigned long long msr; - if (cpu_migrate(cpu)) + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); return -1; + } t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (has_aperf) { - if (get_msr(cpu, MSR_APERF, &t->aperf)) + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) return -3; - if (get_msr(cpu, MSR_MPERF, &t->mperf)) + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) return -4; } if (extra_delta_offset32) { - if (get_msr(cpu, extra_delta_offset32, &t->extra_delta32)) + if (get_msr(cpu, extra_delta_offset32, &msr)) return -5; - t->extra_delta32 &= 0xFFFFFFFF; + t->extra_delta32 = msr & 0xFFFFFFFF; } if (extra_delta_offset64) @@ -691,9 +813,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; if (extra_msr_offset32) { - if (get_msr(cpu, extra_msr_offset32, &t->extra_msr32)) + if (get_msr(cpu, extra_msr_offset32, &msr)) return -5; - t->extra_msr32 &= 0xFFFFFFFF; + t->extra_msr32 = msr & 0xFFFFFFFF; } if (extra_msr_offset64) @@ -715,6 +837,13 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) return -8; + if (do_dts) { + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return -9; + c->core_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + } + + /* collect package counters only for 1st core in package */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; @@ -731,6 +860,41 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) return -12; } + if (do_rapl & RAPL_PKG) { + if (get_msr(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + return -13; + p->energy_pkg = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_CORES) { + if (get_msr(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + return -14; + p->energy_cores = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_DRAM) { + if (get_msr(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + return -15; + p->energy_dram = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_GFX) { + if (get_msr(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + return -16; + p->energy_gfx = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_PKG_PERF_STATUS) { + if (get_msr(cpu, MSR_PKG_PERF_STATUS, &msr)) + return -16; + p->rapl_pkg_perf_status = msr & 0xFFFFFFFF; + } + if (do_rapl & RAPL_DRAM_PERF_STATUS) { + if (get_msr(cpu, MSR_DRAM_PERF_STATUS, &msr)) + return -16; + p->rapl_dram_perf_status = msr & 0xFFFFFFFF; + } + if (do_ptm) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return -17; + p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + } return 0; } 
@@ -742,10 +906,10 @@ void print_verbose_header(void) if (!do_nehalem_platform_info) return; - get_msr(0, MSR_NEHALEM_PLATFORM_INFO, &msr); + get_msr(0, MSR_NHM_PLATFORM_INFO, &msr); - if (verbose > 1) - fprintf(stderr, "MSR_NEHALEM_PLATFORM_INFO: 0x%llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr); ratio = (msr >> 40) & 0xFF; fprintf(stderr, "%d * %.0f = %.0f MHz max efficiency\n", @@ -760,8 +924,8 @@ void print_verbose_header(void) get_msr(0, MSR_IVT_TURBO_RATIO_LIMIT, &msr); - if (verbose > 1) - fprintf(stderr, "MSR_IVT_TURBO_RATIO_LIMIT: 0x%llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -804,14 +968,56 @@ void print_verbose_header(void) ratio, bclk, ratio * bclk); print_nhm_turbo_ratio_limits: + get_msr(0, MSR_NHM_SNB_PKG_CST_CFG_CTL, &msr); + +#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + + fprintf(stderr, "cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x%08llx", msr); + + fprintf(stderr, " (%s%s%s%s%slocked: pkg-cstate-limit=%d: ", + (msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "", + (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "", + (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "", + (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "", + (msr & (1 << 15)) ? "" : "UN", + (unsigned int)msr & 7); + + + switch(msr & 0x7) { + case 0: + fprintf(stderr, "pc0"); + break; + case 1: + fprintf(stderr, do_snb_cstates ? "pc2" : "pc0"); + break; + case 2: + fprintf(stderr, do_snb_cstates ? "pc6-noret" : "pc3"); + break; + case 3: + fprintf(stderr, "pc6"); + break; + case 4: + fprintf(stderr, "pc7"); + break; + case 5: + fprintf(stderr, do_snb_cstates ? "pc7s" : "invalid"); + break; + case 7: + fprintf(stderr, "unlimited"); + break; + default: + fprintf(stderr, "invalid"); + } + fprintf(stderr, ")\n"); if (!do_nehalem_turbo_ratio_limit) return; - get_msr(0, MSR_NEHALEM_TURBO_RATIO_LIMIT, &msr); + get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - if (verbose > 1) - fprintf(stderr, "MSR_NEHALEM_TURBO_RATIO_LIMIT: 0x%llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1100,15 +1306,22 @@ int mark_cpu_present(int cpu) void turbostat_loop() { int retval; + int restarted = 0; restart: + restarted++; + retval = for_all_cpus(get_counters, EVEN_COUNTERS); if (retval < -1) { exit(retval); } else if (retval == -1) { + if (restarted > 1) { + exit(retval); + } re_initialize(); goto restart; } + restarted = 0; gettimeofday(&tv_even, (struct timezone *)NULL); while (1) { @@ -1207,6 +1420,299 @@ int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) } } +/* + * print_epb() + * Decode the ENERGY_PERF_BIAS MSR + */ +int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + char *epb_string; + int cpu; + + if (!has_epb) + return 0; + + cpu = t->cpu_id; + + /* EPB is per-package */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr)) + return 0; + + switch (msr & 0x7) { + case ENERGY_PERF_BIAS_PERFORMANCE: + epb_string = "performance"; + break; + case ENERGY_PERF_BIAS_NORMAL: + epb_string = "balanced"; + break; + case ENERGY_PERF_BIAS_POWERSAVE: + epb_string = "powersave"; + break; + 
default: + epb_string = "custom"; + break; + } + fprintf(stderr, "cpu%d: MSR_IA32_ENERGY_PERF_BIAS: 0x%08llx (%s)\n", cpu, msr, epb_string); + + return 0; +} + +#define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ +#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ + +/* + * rapl_probe() + * + * sets do_rapl + */ +void rapl_probe(unsigned int family, unsigned int model) +{ + unsigned long long msr; + double tdp; + + if (!genuine_intel) + return; + + if (family != 6) + return; + + switch (model) { + case 0x2A: + case 0x3A: + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX; + break; + case 0x2D: + case 0x3E: + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS; + break; + default: + return; + } + + /* units on package 0, verify later other packages match */ + if (get_msr(0, MSR_RAPL_POWER_UNIT, &msr)) + return; + + rapl_power_units = 1.0 / (1 << (msr & 0xF)); + rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); + rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF)); + + /* get TDP to determine energy counter range */ + if (get_msr(0, MSR_PKG_POWER_INFO, &msr)) + return; + + tdp = ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; + + rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; + + if (verbose) + fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range\n", rapl_joule_counter_range); + + return; +} + +int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int dts; + int cpu; + + if (!(do_dts || do_ptm)) + return 0; + + cpu = t->cpu_id; + + /* DTS is per-core, no need to print for each thread */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + fprintf(stderr, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", + cpu, msr, tcc_activation_temp - dts); + +#ifdef THERM_DEBUG + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(stderr, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); +#endif + } + + + if (do_dts) { + unsigned int resolution; + + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + resolution = (msr >> 27) & 0xF; + fprintf(stderr, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", + cpu, msr, tcc_activation_temp - dts, resolution); + +#ifdef THERM_DEBUG + if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(stderr, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); +#endif + } + + return 0; +} + +void print_power_limit_msr(int cpu, unsigned long long msr, char *label) +{ + fprintf(stderr, "cpu%d: %s: %sabled (%f Watts, %f sec, clamp %sabled)\n", + cpu, label, + ((msr >> 15) & 1) ? "EN" : "DIS", + ((msr >> 0) & 0x7FFF) * rapl_power_units, + (1.0 + (((msr >> 22) & 0x3)/4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, + (((msr >> 16) & 1) ? 
"EN" : "DIS")); + + return; +} + +int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + int cpu; + double local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units; + + if (!do_rapl) + return 0; + + /* RAPL counters are per package, so print only for 1st thread/package */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + cpu = t->cpu_id; + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr)) + return -1; + + local_rapl_power_units = 1.0 / (1 << (msr & 0xF)); + local_rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); + local_rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF)); + + if (local_rapl_power_units != rapl_power_units) + fprintf(stderr, "cpu%d, ERROR: Power units mis-match\n", cpu); + if (local_rapl_energy_units != rapl_energy_units) + fprintf(stderr, "cpu%d, ERROR: Energy units mis-match\n", cpu); + if (local_rapl_time_units != rapl_time_units) + fprintf(stderr, "cpu%d, ERROR: Time units mis-match\n", cpu); + + if (verbose) { + fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx " + "(%f Watts, %f Joules, %f sec.)\n", cpu, msr, + local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units); + } + if (do_rapl & RAPL_PKG) { + if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) + return -5; + + + fprintf(stderr, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", + cpu, msr, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + + if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) + return -9; + + fprintf(stderr, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 63) & 1 ? "": "UN"); + + print_power_limit_msr(cpu, msr, "PKG Limit #1"); + fprintf(stderr, "cpu%d: PKG Limit #2: %sabled (%f Watts, %f* sec, clamp %sabled)\n", + cpu, + ((msr >> 47) & 1) ? "EN" : "DIS", + ((msr >> 32) & 0x7FFF) * rapl_power_units, + (1.0 + (((msr >> 54) & 0x3)/4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, + ((msr >> 48) & 1) ? "EN" : "DIS"); + } + + if (do_rapl & RAPL_DRAM) { + if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) + return -6; + + + fprintf(stderr, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", + cpu, msr, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + + + if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) + return -9; + fprintf(stderr, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 31) & 1 ? "": "UN"); + + print_power_limit_msr(cpu, msr, "DRAM Limit"); + } + if (do_rapl & RAPL_CORES) { + if (verbose) { + if (get_msr(cpu, MSR_PP0_POLICY, &msr)) + return -7; + + fprintf(stderr, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); + + if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) + return -9; + fprintf(stderr, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 31) & 1 ? 
"": "UN"); + print_power_limit_msr(cpu, msr, "Cores Limit"); + } + } + if (do_rapl & RAPL_GFX) { + if (verbose) { + if (get_msr(cpu, MSR_PP1_POLICY, &msr)) + return -8; + + fprintf(stderr, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF); + + if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr)) + return -9; + fprintf(stderr, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", + cpu, msr, (msr >> 31) & 1 ? "": "UN"); + print_power_limit_msr(cpu, msr, "GFX Limit"); + } + } + return 0; +} + int is_snb(unsigned int family, unsigned int model) { @@ -1231,6 +1737,72 @@ double discover_bclk(unsigned int family, unsigned int model) return 133.33; } +/* + * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where + * the Thermal Control Circuit (TCC) activates. + * This is usually equal to tjMax. + * + * Older processors do not have this MSR, so there we guess, + * but also allow cmdline over-ride with -T. + * + * Several MSR temperature values are in units of degrees-C + * below this value, including the Digital Thermal Sensor (DTS), + * Package Thermal Management Sensor (PTM), and thermal event thresholds. + */ +int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int target_c_local; + int cpu; + + /* tcc_activation_temp is used only for dts or ptm */ + if (!(do_dts || do_ptm)) + return 0; + + /* this is a per-package concept */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + return 0; + + cpu = t->cpu_id; + if (cpu_migrate(cpu)) { + fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (tcc_activation_temp_override != 0) { + tcc_activation_temp = tcc_activation_temp_override; + fprintf(stderr, "cpu%d: Using cmdline TCC Target (%d C)\n", + cpu, tcc_activation_temp); + return 0; + } + + /* Temperature Target MSR is Nehalem and newer only */ + if (!do_nehalem_platform_info) + goto guess; + + if (get_msr(0, MSR_IA32_TEMPERATURE_TARGET, &msr)) + goto guess; + + target_c_local = (msr >> 16) & 0x7F; + + if (verbose) + fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + cpu, msr, target_c_local); + + if (target_c_local < 85 || target_c_local > 120) + goto guess; + + tcc_activation_temp = target_c_local; + + return 0; + +guess: + tcc_activation_temp = TJMAX_DEFAULT; + fprintf(stderr, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", + cpu, tcc_activation_temp); + + return 0; +} void check_cpuid() { unsigned int eax, ebx, ecx, edx, max_level; @@ -1244,7 +1816,7 @@ void check_cpuid() genuine_intel = 1; if (verbose) - fprintf(stderr, "%.4s%.4s%.4s ", + fprintf(stderr, "CPUID(0): %.4s%.4s%.4s ", (char *)&ebx, (char *)&edx, (char *)&ecx); asm("cpuid" : "=a" (fms), "=c" (ecx), "=d" (edx) : "a" (1) : "ebx"); @@ -1295,10 +1867,19 @@ void check_cpuid() asm("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (0x6)); has_aperf = ecx & (1 << 0); - if (!has_aperf) { - fprintf(stderr, "No APERF MSR\n"); - exit(1); - } + do_dts = eax & (1 << 0); + do_ptm = eax & (1 << 6); + has_epb = ecx & (1 << 3); + + if (verbose) + fprintf(stderr, "CPUID(6): %s%s%s%s\n", + has_aperf ? "APERF" : "No APERF!", + do_dts ? ", DTS" : "", + do_ptm ? ", PTM": "", + has_epb ? 
", EPB": ""); + + if (!has_aperf) + exit(-1); do_nehalem_platform_info = genuine_intel && has_invariant_tsc; do_nhm_cstates = genuine_intel; /* all Intel w/ non-stop TSC have NHM counters */ @@ -1307,12 +1888,15 @@ void check_cpuid() do_nehalem_turbo_ratio_limit = has_nehalem_turbo_ratio_limit(family, model); do_ivt_turbo_ratio_limit = has_ivt_turbo_ratio_limit(family, model); + rapl_probe(family, model); + + return; } void usage() { - fprintf(stderr, "%s: [-v][-p|-P|-S][-c MSR# | -s]][-C MSR#][-m MSR#][-M MSR#][-i interval_sec | command ...]\n", + fprintf(stderr, "%s: [-v][-R][-T][-p|-P|-S][-c MSR# | -s]][-C MSR#][-m MSR#][-M MSR#][-i interval_sec | command ...]\n", progname); exit(1); } @@ -1548,6 +2132,17 @@ void turbostat_init() if (verbose) print_verbose_header(); + + if (verbose) + for_all_cpus(print_epb, ODD_COUNTERS); + + if (verbose) + for_all_cpus(print_rapl, ODD_COUNTERS); + + for_all_cpus(set_temperature_target, ODD_COUNTERS); + + if (verbose) + for_all_cpus(print_thermal, ODD_COUNTERS); } int fork_it(char **argv) @@ -1604,7 +2199,7 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt(argc, argv, "+pPSvi:sc:sC:m:M:")) != -1) { + while ((opt = getopt(argc, argv, "+pPSvi:sc:sC:m:M:RT:")) != -1) { switch (opt) { case 'p': show_core_only++; @@ -1636,6 +2231,12 @@ void cmdline(int argc, char **argv) case 'M': sscanf(optarg, "%x", &extra_msr_offset64); break; + case 'R': + rapl_verbose++; + break; + case 'T': + tcc_activation_temp_override = atoi(optarg); + break; default: usage(); } @@ -1646,8 +2247,8 @@ int main(int argc, char **argv) { cmdline(argc, argv); - if (verbose > 1) - fprintf(stderr, "turbostat v2.1 October 6, 2012" + if (verbose) + fprintf(stderr, "turbostat v3.0 November 23, 2012" " - Len Brown \n"); turbostat_init(); diff --git a/trunk/tools/power/x86/x86_energy_perf_policy/Makefile b/trunk/tools/power/x86/x86_energy_perf_policy/Makefile index f458237fdd79..971c9ffdcb50 100644 --- a/trunk/tools/power/x86/x86_energy_perf_policy/Makefile +++ b/trunk/tools/power/x86/x86_energy_perf_policy/Makefile @@ -1,8 +1,10 @@ +DESTDIR ?= + x86_energy_perf_policy : x86_energy_perf_policy.c clean : rm -f x86_energy_perf_policy install : - install x86_energy_perf_policy /usr/bin/ - install x86_energy_perf_policy.8 /usr/share/man/man8/ + install x86_energy_perf_policy ${DESTDIR}/usr/bin/ + install x86_energy_perf_policy.8 ${DESTDIR}/usr/share/man/man8/ diff --git a/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 33c5c7ee148f..40b3e5482f8a 100644 --- a/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/trunk/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -289,7 +289,7 @@ void for_every_cpu(void (func)(int)) "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu); if (retval != 1) - return; + break; func(cpu); } diff --git a/trunk/tools/testing/selftests/breakpoints/Makefile b/trunk/tools/testing/selftests/breakpoints/Makefile index 931278035f5c..e18b42b254af 100644 --- a/trunk/tools/testing/selftests/breakpoints/Makefile +++ b/trunk/tools/testing/selftests/breakpoints/Makefile @@ -17,7 +17,7 @@ else endif run_tests: - ./breakpoint_test + @./breakpoint_test || echo "breakpoints selftests: [FAIL]" clean: rm -fr breakpoint_test diff --git a/trunk/tools/testing/selftests/cpu-hotplug/Makefile b/trunk/tools/testing/selftests/cpu-hotplug/Makefile index 7c9c20ff578a..12657a5e4bf9 100644 --- 
a/trunk/tools/testing/selftests/cpu-hotplug/Makefile +++ b/trunk/tools/testing/selftests/cpu-hotplug/Makefile @@ -1,6 +1,6 @@ all: run_tests: - ./on-off-test.sh + @./on-off-test.sh || echo "cpu-hotplug selftests: [FAIL]" clean: diff --git a/trunk/tools/testing/selftests/kcmp/Makefile b/trunk/tools/testing/selftests/kcmp/Makefile index dc79b86ea65c..56eb5523dbb8 100644 --- a/trunk/tools/testing/selftests/kcmp/Makefile +++ b/trunk/tools/testing/selftests/kcmp/Makefile @@ -16,13 +16,13 @@ CFLAGS += -I../../../../arch/x86/include/ all: ifeq ($(ARCH),X86) - gcc $(CFLAGS) kcmp_test.c -o run_test + gcc $(CFLAGS) kcmp_test.c -o kcmp_test else echo "Not an x86 target, can't build kcmp selftest" endif -run-tests: all - ./kcmp_test +run_tests: all + @./kcmp_test || echo "kcmp_test: [FAIL]" clean: rm -fr ./run_test diff --git a/trunk/tools/testing/selftests/kcmp/kcmp_test.c b/trunk/tools/testing/selftests/kcmp/kcmp_test.c index 358cc6bfa35d..fa4f1b37e045 100644 --- a/trunk/tools/testing/selftests/kcmp/kcmp_test.c +++ b/trunk/tools/testing/selftests/kcmp/kcmp_test.c @@ -72,7 +72,8 @@ int main(int argc, char **argv) /* This one should return same fd */ ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1); if (ret) { - printf("FAIL: 0 expected but %d returned\n", ret); + printf("FAIL: 0 expected but %d returned (%s)\n", + ret, strerror(errno)); ret = -1; } else printf("PASS: 0 returned as expected\n"); @@ -80,7 +81,8 @@ int main(int argc, char **argv) /* Compare with self */ ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0); if (ret) { - printf("FAIL: 0 expected but %li returned\n", ret); + printf("FAIL: 0 expected but %li returned (%s)\n", + ret, strerror(errno)); ret = -1; } else printf("PASS: 0 returned as expected\n"); diff --git a/trunk/tools/testing/selftests/memory-hotplug/Makefile b/trunk/tools/testing/selftests/memory-hotplug/Makefile index 7c9c20ff578a..0f49c3f5f58d 100644 --- a/trunk/tools/testing/selftests/memory-hotplug/Makefile +++ b/trunk/tools/testing/selftests/memory-hotplug/Makefile @@ -1,6 +1,6 @@ all: run_tests: - ./on-off-test.sh + @./on-off-test.sh || echo "memory-hotplug selftests: [FAIL]" clean: diff --git a/trunk/tools/testing/selftests/mqueue/Makefile b/trunk/tools/testing/selftests/mqueue/Makefile index 54c0aad2b47c..218a122c7951 100644 --- a/trunk/tools/testing/selftests/mqueue/Makefile +++ b/trunk/tools/testing/selftests/mqueue/Makefile @@ -3,8 +3,8 @@ all: gcc -O2 -lrt -lpthread -lpopt -o mq_perf_tests mq_perf_tests.c run_tests: - ./mq_open_tests /test1 - ./mq_perf_tests + @./mq_open_tests /test1 || echo "mq_open_tests: [FAIL]" + @./mq_perf_tests || echo "mq_perf_tests: [FAIL]" clean: rm -f mq_open_tests mq_perf_tests diff --git a/trunk/tools/testing/selftests/vm/Makefile b/trunk/tools/testing/selftests/vm/Makefile index 7300d0702efe..436d2e81868b 100644 --- a/trunk/tools/testing/selftests/vm/Makefile +++ b/trunk/tools/testing/selftests/vm/Makefile @@ -8,7 +8,7 @@ all: hugepage-mmap hugepage-shm map_hugetlb thuge-gen $(CC) $(CFLAGS) -o $@ $^ run_tests: all - /bin/sh ./run_vmtests + @/bin/sh ./run_vmtests || echo "vmtests: [FAIL]" clean: $(RM) hugepage-mmap hugepage-shm map_hugetlb
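Note on the RAPL power columns added to turbostat above: MSR_PKG_ENERGY_STATUS and the related energy MSRs are free-running 32-bit counters in units of rapl_energy_units joules (decoded from bits 12:8 of MSR_RAPL_POWER_UNIT), so the patch takes wrap-safe deltas with DELTA_WRAP32 and divides by the sample interval to report watts. The sketch below is not part of the patch and the helper names are purely illustrative; it assumes two raw counter reads and the decoded energy unit are already in hand, and only restates the arithmetic format_counters() performs.

#include <stdint.h>

/* Wrap-safe delta of a free-running 32-bit counter (what DELTA_WRAP32 does). */
static uint32_t delta_wrap32(uint32_t cur, uint32_t prev)
{
	return (cur >= prev) ? cur - prev
			     : (uint32_t)(0x100000000ULL + cur - prev);
}

/*
 * Average power over one sample, in watts.
 * rapl_energy_units = 1.0 / (1 << ((msr >> 8) & 0x1F)) from MSR_RAPL_POWER_UNIT,
 * e.g. 1/65536 J (about 15 microjoules) on the Sandy Bridge example in the man page.
 */
static double rapl_counter_to_watts(uint32_t cur, uint32_t prev,
				    double rapl_energy_units, double interval_sec)
{
	return delta_wrap32(cur, prev) * rapl_energy_units / interval_sec;
}

With that roughly 15 microjoule energy unit and a 77 W package TDP, the 32-bit counter covers about 0xFFFFFFFF * rapl_energy_units / tdp, or 851 seconds; that is the "Joule Counter Range" rapl_probe() reports in verbose mode, and why format_counters() prints the watt columns with a trailing "**" when the measurement interval exceeds it.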