From 7398bb460b7a07b923a08ba922345caef2dff22b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Jun 2009 19:14:22 -0700 Subject: [PATCH] --- yaml --- r: 147897 b: refs/heads/master c: 8c5dd8f43367f4f266dd616f11658005bc2d20ef h: refs/heads/master i: 147895: 74d9b71b844dbf483b2531733ee2bbfce64938eb v: v3 --- [refs] | 2 +- trunk/MAINTAINERS | 8 +- trunk/arch/alpha/kernel/osf_sys.c | 3 + trunk/arch/m68k/include/asm/m520xsim.h | 9 - trunk/arch/m68k/include/asm/m523xsim.h | 9 - trunk/arch/m68k/include/asm/m527xsim.h | 9 - trunk/arch/m68k/include/asm/m528xsim.h | 8 - trunk/arch/m68k/include/asm/m532xsim.h | 12 - trunk/arch/m68k/include/asm/processor_no.h | 8 +- trunk/arch/m68k/include/asm/swab.h | 2 +- trunk/arch/m68k/include/asm/system_no.h | 107 + trunk/arch/m68knommu/kernel/entry.S | 1 + trunk/arch/m68knommu/kernel/setup.c | 16 +- trunk/arch/m68knommu/mm/init.c | 4 +- trunk/arch/m68knommu/platform/5206/config.c | 18 +- trunk/arch/m68knommu/platform/5206e/config.c | 18 +- trunk/arch/m68knommu/platform/520x/config.c | 15 +- trunk/arch/m68knommu/platform/523x/config.c | 14 +- trunk/arch/m68knommu/platform/5249/config.c | 18 +- trunk/arch/m68knommu/platform/5272/config.c | 18 +- trunk/arch/m68knommu/platform/527x/config.c | 15 +- trunk/arch/m68knommu/platform/528x/config.c | 13 +- trunk/arch/m68knommu/platform/5307/config.c | 16 +- trunk/arch/m68knommu/platform/532x/config.c | 12 +- trunk/arch/m68knommu/platform/5407/config.c | 16 +- .../m68knommu/platform/coldfire/vectors.c | 7 + trunk/arch/powerpc/include/asm/hw_irq.h | 3 + trunk/arch/powerpc/kernel/irq.c | 1 - trunk/arch/x86/kernel/apic/io_apic.c | 4 +- trunk/arch/x86/kernel/setup.c | 15 +- trunk/block/blk-core.c | 21 +- trunk/block/blk-settings.c | 6 +- trunk/drivers/ieee1394/dv1394.c | 5 +- trunk/drivers/ieee1394/ieee1394_core.h | 6 +- trunk/drivers/usb/core/inode.c | 5 - trunk/fs/adfs/adfs.h | 4 +- trunk/fs/adfs/dir.c | 10 +- trunk/fs/adfs/dir_f.c | 17 - trunk/fs/adfs/dir_fplus.c | 17 - trunk/fs/adfs/file.c | 2 +- trunk/fs/adfs/inode.c | 4 +- trunk/fs/adfs/map.c | 2 +- trunk/fs/adfs/super.c | 4 - trunk/fs/affs/affs.h | 1 - trunk/fs/affs/dir.c | 2 +- trunk/fs/affs/file.c | 14 +- trunk/fs/affs/super.c | 54 +- trunk/fs/afs/mntpt.c | 2 +- trunk/fs/afs/super.c | 4 - trunk/fs/autofs/dirhash.c | 5 +- trunk/fs/autofs4/autofs_i.h | 6 +- trunk/fs/autofs4/dev-ioctl.c | 195 +- trunk/fs/autofs4/expire.c | 15 +- trunk/fs/autofs4/root.c | 7 +- trunk/fs/befs/linuxvfs.c | 5 +- trunk/fs/bfs/dir.c | 8 +- trunk/fs/bfs/inode.c | 52 +- trunk/fs/block_dev.c | 19 +- trunk/fs/btrfs/Makefile | 4 +- trunk/fs/btrfs/acl.c | 5 + trunk/fs/btrfs/async-thread.c | 2 +- trunk/fs/btrfs/btrfs_inode.h | 4 +- trunk/fs/btrfs/compression.c | 6 +- trunk/fs/btrfs/crc32c.h | 29 + trunk/fs/btrfs/ctree.c | 698 ++-- trunk/fs/btrfs/ctree.h | 330 +- trunk/fs/btrfs/delayed-ref.c | 509 +-- trunk/fs/btrfs/delayed-ref.h | 85 +- trunk/fs/btrfs/disk-io.c | 164 +- trunk/fs/btrfs/export.c | 4 +- trunk/fs/btrfs/extent-tree.c | 2672 ++++-------- trunk/fs/btrfs/extent_io.c | 18 +- trunk/fs/btrfs/file.c | 78 +- trunk/fs/btrfs/free-space-cache.c | 10 +- trunk/fs/btrfs/free-space-cache.h | 1 - trunk/fs/btrfs/hash.h | 4 +- trunk/fs/btrfs/inode.c | 166 +- trunk/fs/btrfs/ioctl.c | 197 +- trunk/fs/btrfs/print-tree.c | 155 +- trunk/fs/btrfs/relocation.c | 3711 ----------------- trunk/fs/btrfs/root-tree.c | 17 +- trunk/fs/btrfs/super.c | 64 +- trunk/fs/btrfs/transaction.c | 410 +- trunk/fs/btrfs/transaction.h | 12 +- trunk/fs/btrfs/tree-log.c | 103 +- trunk/fs/btrfs/volumes.c | 69 +- 
trunk/fs/btrfs/volumes.h | 12 +- trunk/fs/cachefiles/interface.c | 4 +- trunk/fs/char_dev.c | 14 +- trunk/fs/cifs/cifs_dfs_ref.c | 2 +- trunk/fs/cifs/cifsfs.c | 6 +- trunk/fs/compat.c | 2 + trunk/fs/dcache.c | 7 +- trunk/fs/ecryptfs/super.c | 5 - trunk/fs/exofs/super.c | 25 +- trunk/fs/ext2/Makefile | 2 +- trunk/fs/ext2/dir.c | 2 +- trunk/fs/ext2/ext2.h | 3 + trunk/fs/ext2/file.c | 4 +- trunk/fs/ext2/fsync.c | 50 + trunk/fs/ext2/inode.c | 11 +- trunk/fs/ext2/super.c | 60 +- trunk/fs/ext3/balloc.c | 3 +- trunk/fs/ext3/ialloc.c | 3 +- trunk/fs/ext3/inode.c | 1 + trunk/fs/ext3/resize.c | 2 + trunk/fs/ext3/super.c | 34 +- trunk/fs/ext3/xattr.c | 1 + trunk/fs/ext4/super.c | 16 +- trunk/fs/fat/dir.c | 16 +- trunk/fs/fat/fat.h | 6 - trunk/fs/fat/fatent.c | 13 +- trunk/fs/fat/file.c | 14 +- trunk/fs/fat/inode.c | 31 +- trunk/fs/fat/namei_msdos.c | 4 +- trunk/fs/fat/namei_vfat.c | 4 +- trunk/fs/file_table.c | 40 +- trunk/fs/freevxfs/vxfs_super.c | 4 - trunk/fs/fs-writeback.c | 92 +- trunk/fs/gfs2/log.c | 2 + trunk/fs/gfs2/super.c | 15 +- trunk/fs/hfs/super.c | 23 +- trunk/fs/hfsplus/super.c | 25 +- trunk/fs/hpfs/super.c | 12 - trunk/fs/inode.c | 12 +- trunk/fs/internal.h | 17 - trunk/fs/isofs/inode.c | 5 - trunk/fs/jffs2/fs.c | 18 +- trunk/fs/jffs2/os-linux.h | 1 + trunk/fs/jffs2/super.c | 26 - trunk/fs/jfs/super.c | 27 +- trunk/fs/libfs.c | 25 - trunk/fs/minix/dir.c | 2 +- trunk/fs/minix/file.c | 20 +- trunk/fs/minix/inode.c | 37 +- trunk/fs/minix/minix.h | 2 + trunk/fs/namei.c | 129 +- trunk/fs/namespace.c | 327 +- trunk/fs/ncpfs/inode.c | 4 - trunk/fs/nfs/namespace.c | 2 +- trunk/fs/nfs/super.c | 2 - trunk/fs/nfsd/export.c | 78 +- trunk/fs/nfsd/vfs.c | 54 +- trunk/fs/nilfs2/cpfile.c | 6 +- trunk/fs/nilfs2/sb.h | 1 - trunk/fs/nilfs2/super.c | 256 +- trunk/fs/nilfs2/the_nilfs.c | 113 +- trunk/fs/nilfs2/the_nilfs.h | 23 +- trunk/fs/notify/Kconfig | 13 - trunk/fs/notify/Makefile | 2 - trunk/fs/notify/dnotify/Kconfig | 1 - trunk/fs/notify/dnotify/dnotify.c | 464 +-- trunk/fs/notify/fsnotify.c | 186 - trunk/fs/notify/fsnotify.h | 34 - trunk/fs/notify/group.c | 254 -- trunk/fs/notify/inode_mark.c | 426 -- trunk/fs/notify/inotify/Kconfig | 20 +- trunk/fs/notify/inotify/Makefile | 2 +- trunk/fs/notify/inotify/inotify.c | 20 - trunk/fs/notify/inotify/inotify.h | 21 - trunk/fs/notify/inotify/inotify_fsnotify.c | 138 - trunk/fs/notify/inotify/inotify_user.c | 837 ++-- trunk/fs/notify/notification.c | 411 -- trunk/fs/ntfs/super.c | 54 +- trunk/fs/ocfs2/super.c | 22 +- trunk/fs/omfs/file.c | 17 +- trunk/fs/open.c | 4 +- trunk/fs/proc/internal.h | 25 - trunk/fs/proc/proc_devtree.c | 1 - trunk/fs/qnx4/Makefile | 2 +- trunk/fs/qnx4/bitmap.c | 7 +- trunk/fs/qnx4/dir.c | 9 +- trunk/fs/qnx4/file.c | 5 +- trunk/fs/qnx4/fsync.c | 169 + trunk/fs/qnx4/inode.c | 58 +- trunk/fs/qnx4/namei.c | 13 +- trunk/fs/qnx4/qnx4.h | 57 - trunk/fs/qnx4/truncate.c | 6 +- trunk/fs/quota/quota.c | 25 +- trunk/fs/reiserfs/dir.c | 10 +- trunk/fs/reiserfs/super.c | 33 +- trunk/fs/reiserfs/xattr.c | 3 +- trunk/fs/smbfs/inode.c | 4 - trunk/fs/squashfs/super.c | 4 - trunk/fs/super.c | 192 +- trunk/fs/sync.c | 117 +- trunk/fs/sysv/dir.c | 2 +- trunk/fs/sysv/file.c | 17 +- trunk/fs/sysv/inode.c | 75 +- trunk/fs/sysv/sysv.h | 1 + trunk/fs/ubifs/super.c | 17 +- trunk/fs/udf/Makefile | 2 +- trunk/fs/udf/dir.c | 2 +- trunk/fs/udf/file.c | 2 +- trunk/fs/udf/fsync.c | 52 + trunk/fs/udf/super.c | 11 +- trunk/fs/udf/udfdecl.h | 3 + trunk/fs/ufs/dir.c | 2 +- trunk/fs/ufs/file.c | 23 +- trunk/fs/ufs/super.c | 65 +- trunk/fs/ufs/ufs.h | 1 + 
trunk/fs/xattr.c | 4 +- trunk/fs/xfs/linux-2.6/xfs_super.c | 12 + trunk/fs/xfs/xfs_trans.c | 2 + trunk/include/linux/Kbuild | 2 +- trunk/include/linux/cdev.h | 2 - trunk/include/linux/cramfs_fs.h | 3 +- trunk/include/linux/dcache.h | 11 +- trunk/include/linux/dnotify.h | 29 +- trunk/include/linux/fs.h | 27 +- trunk/include/linux/fsnotify.h | 199 +- trunk/include/linux/fsnotify_backend.h | 387 -- trunk/include/linux/magic.h | 2 - trunk/include/linux/mount.h | 25 +- trunk/include/linux/namei.h | 5 +- trunk/include/linux/nfsd/export.h | 6 +- trunk/include/linux/proc_fs.h | 24 + trunk/include/linux/qnx4_fs.h | 61 + trunk/include/linux/quotaops.h | 20 +- trunk/include/linux/reiserfs_fs_sb.h | 2 - trunk/include/linux/writeback.h | 1 + trunk/init/Kconfig | 3 +- trunk/kernel/audit_tree.c | 6 +- trunk/kernel/cgroup.c | 3 - trunk/virt/kvm/kvm_main.c | 1 + 225 files changed, 4890 insertions(+), 12065 deletions(-) create mode 100644 trunk/fs/btrfs/crc32c.h delete mode 100644 trunk/fs/btrfs/relocation.c create mode 100644 trunk/fs/ext2/fsync.c delete mode 100644 trunk/fs/notify/fsnotify.c delete mode 100644 trunk/fs/notify/fsnotify.h delete mode 100644 trunk/fs/notify/group.c delete mode 100644 trunk/fs/notify/inode_mark.c delete mode 100644 trunk/fs/notify/inotify/inotify.h delete mode 100644 trunk/fs/notify/inotify/inotify_fsnotify.c delete mode 100644 trunk/fs/notify/notification.c create mode 100644 trunk/fs/qnx4/fsync.c delete mode 100644 trunk/fs/qnx4/qnx4.h create mode 100644 trunk/fs/udf/fsync.c delete mode 100644 trunk/include/linux/fsnotify_backend.h diff --git a/[refs] b/[refs] index 3ec96a39b723..ad5cbea34100 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 8ebf975608aaebd7feb33d77f07ba21a6380e086 +refs/heads/master: 8c5dd8f43367f4f266dd616f11658005bc2d20ef diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index c944d618dc83..1a0084e22cf3 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -1802,10 +1802,10 @@ F: drivers/char/epca* F: drivers/char/digi* DIRECTORY NOTIFICATION (DNOTIFY) -P: Eric Paris -M: eparis@parisplace.org +P: Stephen Rothwell +M: sfr@canb.auug.org.au L: linux-kernel@vger.kernel.org -S: Maintained +S: Supported F: Documentation/filesystems/dnotify.txt F: fs/notify/dnotify/ F: include/linux/dnotify.h @@ -2858,8 +2858,6 @@ P: John McCutchan M: john@johnmccutchan.com P: Robert Love M: rlove@rlove.org -P: Eric Paris -M: eparis@parisplace.org L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/inotify.txt diff --git a/trunk/arch/alpha/kernel/osf_sys.c b/trunk/arch/alpha/kernel/osf_sys.c index 9a3334ae282e..42ee05981e71 100644 --- a/trunk/arch/alpha/kernel/osf_sys.c +++ b/trunk/arch/alpha/kernel/osf_sys.c @@ -371,6 +371,8 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, int retval = -EINVAL; char *name; + lock_kernel(); + name = getname(path); retval = PTR_ERR(name); if (IS_ERR(name)) @@ -390,6 +392,7 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, } putname(name); out: + unlock_kernel(); return retval; } diff --git a/trunk/arch/m68k/include/asm/m520xsim.h b/trunk/arch/m68k/include/asm/m520xsim.h index 83bbcfd6e8f2..49d016e6391a 100644 --- a/trunk/arch/m68k/include/asm/m520xsim.h +++ b/trunk/arch/m68k/include/asm/m520xsim.h @@ -59,14 +59,5 @@ #define MCFPIT_IMR MCFINTC_IMRL #define MCFPIT_IMR_IBIT (1 << MCFINT_PIT1) -/* - * Reset Controll Unit. 
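The block removed here (and its IPSBAR-relative twins in the other ColdFire headers below) described the Reset Controller registers; platform code triggered a software reset by setting SWRESET in RCR. A minimal sketch of that use, assuming the register names from the removed defines and the usual kernel I/O accessors (this is essentially the m520x_cpu_reset() that a later hunk in this same patch removes):

#include <linux/irqflags.h>
#include <asm/io.h>

#define MCF_RCR         0xFC0A0000      /* Reset Control Register (m520x) */
#define MCF_RCR_SWRESET 0x80            /* software reset bit */

static void m520x_soft_reset(void)
{
        local_irq_disable();
        __raw_writeb(MCF_RCR_SWRESET, MCF_RCR); /* CPU resets on this write */
        for (;;)
                ;                               /* never reached */
}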
- */ -#define MCF_RCR 0xFC0A0000 -#define MCF_RSR 0xFC0A0001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /****************************************************************************/ #endif /* m520xsim_h */ diff --git a/trunk/arch/m68k/include/asm/m523xsim.h b/trunk/arch/m68k/include/asm/m523xsim.h index 55183b5df1b8..bf397313e93f 100644 --- a/trunk/arch/m68k/include/asm/m523xsim.h +++ b/trunk/arch/m68k/include/asm/m523xsim.h @@ -41,14 +41,5 @@ #define MCFSIM_DACR1 0x50 /* SDRAM base address 1 */ #define MCFSIM_DMR1 0x54 /* SDRAM address mask 1 */ -/* - * Reset Controll Unit (relative to IPSBAR). - */ -#define MCF_RCR 0x110000 -#define MCF_RSR 0x110001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /****************************************************************************/ #endif /* m523xsim_h */ diff --git a/trunk/arch/m68k/include/asm/m527xsim.h b/trunk/arch/m68k/include/asm/m527xsim.h index 95f4f8ee8f7c..1f63ab3fb3e6 100644 --- a/trunk/arch/m68k/include/asm/m527xsim.h +++ b/trunk/arch/m68k/include/asm/m527xsim.h @@ -70,14 +70,5 @@ #define UART2_ENABLE_MASK 0x3f00 #endif -/* - * Reset Controll Unit (relative to IPSBAR). - */ -#define MCF_RCR 0x110000 -#define MCF_RSR 0x110001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /****************************************************************************/ #endif /* m527xsim_h */ diff --git a/trunk/arch/m68k/include/asm/m528xsim.h b/trunk/arch/m68k/include/asm/m528xsim.h index d79c49f8134a..28bf783a5d6d 100644 --- a/trunk/arch/m68k/include/asm/m528xsim.h +++ b/trunk/arch/m68k/include/asm/m528xsim.h @@ -56,14 +56,6 @@ #define MCF5282_INTC0_ICR17 (volatile u8 *) (MCF_IPSBAR + 0x0C51) -/* - * Reset Control Unit (relative to IPSBAR). 
- */ -#define MCF_RCR 0x110000 -#define MCF_RSR 0x110001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ /********************************************************************* * diff --git a/trunk/arch/m68k/include/asm/m532xsim.h b/trunk/arch/m68k/include/asm/m532xsim.h index eb7fd4448947..ce603451b55e 100644 --- a/trunk/arch/m68k/include/asm/m532xsim.h +++ b/trunk/arch/m68k/include/asm/m532xsim.h @@ -125,18 +125,6 @@ #define ACR_CM_OFF_IMP (3<<5) #define ACR_WPROTECT (1<<2) -/********************************************************************* - * - * Reset Controller Module - * - *********************************************************************/ - -#define MCF_RCR 0xFC0A0000 -#define MCF_RSR 0xFC0A0001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /********************************************************************* * * Inter-IC (I2C) Module diff --git a/trunk/arch/m68k/include/asm/processor_no.h b/trunk/arch/m68k/include/asm/processor_no.h index 7a1e0ba35f5a..91cba18acdd3 100644 --- a/trunk/arch/m68k/include/asm/processor_no.h +++ b/trunk/arch/m68k/include/asm/processor_no.h @@ -72,10 +72,10 @@ struct thread_struct { unsigned char fpstate[FPSTATESIZE]; /* floating point state */ }; -#define INIT_THREAD { \ - .ksp = sizeof(init_stack) + (unsigned long) init_stack, \ - .sr = PS_S, \ - .fs = __KERNEL_DS, \ +#define INIT_THREAD { \ + sizeof(init_stack) + (unsigned long) init_stack, 0, \ + PS_S, __KERNEL_DS, \ + {0, 0}, 0, {0,}, {0, 0, 0}, {0,}, \ } /* diff --git a/trunk/arch/m68k/include/asm/swab.h b/trunk/arch/m68k/include/asm/swab.h index 5b754aace744..9e3054ea59e9 100644 --- a/trunk/arch/m68k/include/asm/swab.h +++ b/trunk/arch/m68k/include/asm/swab.h @@ -1,7 +1,7 @@ #ifndef _M68K_SWAB_H #define _M68K_SWAB_H -#include +#include #include #define __SWAB_64_THRU_32__ diff --git a/trunk/arch/m68k/include/asm/system_no.h b/trunk/arch/m68k/include/asm/system_no.h index 3c0718d74398..4496c0aa8379 100644 --- a/trunk/arch/m68k/include/asm/system_no.h +++ b/trunk/arch/m68k/include/asm/system_no.h @@ -203,6 +203,113 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz #include #endif +#if defined( CONFIG_M68328 ) || defined( CONFIG_M68EZ328 ) || \ + defined (CONFIG_M68360) || defined( CONFIG_M68VZ328 ) +#define HARD_RESET_NOW() ({ \ + local_irq_disable(); \ + asm(" \ + moveal #0x10c00000, %a0; \ + moveb #0, 0xFFFFF300; \ + moveal 0(%a0), %sp; \ + moveal 4(%a0), %a0; \ + jmp (%a0); \ + "); \ +}) +#endif + +#ifdef CONFIG_COLDFIRE +#if defined(CONFIG_M5272) && defined(CONFIG_NETtel) +/* + * Need to account for broken early mask of 5272 silicon. So don't + * jump through the original start address. Jump strait into the + * known start of the FLASH code. + */ +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + jmp 0xf0000400; \ + "); \ +}) +#elif defined(CONFIG_NETtel) || \ + defined(CONFIG_SECUREEDGEMP3) || defined(CONFIG_CLEOPATRA) +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + moveal #0x10000044, %a0; \ + movel #0xffffffff, (%a0); \ + moveal #0x10000001, %a0; \ + moveb #0x00, (%a0); \ + moveal #0xf0000004, %a0; \ + moveal (%a0), %a0; \ + jmp (%a0); \ + "); \ +}) +#elif defined(CONFIG_M5272) +/* + * Retrieve the boot address in flash using CSBR0 and CSOR0 + * find the reset vector at flash_address + 4 (e.g. 0x400) + * remap it in the flash's current location (e.g. 0xf0000400) + * and jump there. 
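A C rendering of the assembly in this macro may help; this is illustrative only (the real sequence must execute with interrupts masked and without relying on RAM, hence the inline asm), with MCF_MBAR assumed from the m5272 platform headers:

static void m5272_flash_reset(void)
{
        unsigned long base, target;

        /* CSBR0 and CSOR0 (MBAR + 0x40/0x44) yield the flash base */
        base  = *(volatile unsigned long *)(MCF_MBAR + 0x40);
        base &= *(volatile unsigned long *)(MCF_MBAR + 0x44);
        base &= 0xfffff000;

        /* the reset vector sits at flash base + 4; OR-ing remaps it
         * into the flash's currently decoded window */
        target = base | *(volatile unsigned long *)(base + 4);

        ((void (*)(void))target)();     /* jump into the FLASH code */
}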
+ */ +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %%sr; \ + move.l %0+0x40,%%d0; \ + and.l %0+0x44,%%d0; \ + andi.l #0xfffff000,%%d0; \ + mov.l %%d0,%%a0; \ + or.l 4(%%a0),%%d0; \ + mov.l %%d0,%%a0; \ + jmp (%%a0);" \ + : /* No output */ \ + : "o" (*(char *)MCF_MBAR) ); \ +}) +#elif defined(CONFIG_M528x) +/* + * The MCF528x has a bit (SOFTRST) in memory (Reset Control Register RCR), + * that when set, resets the MCF528x. + */ +#define HARD_RESET_NOW() \ +({ \ + unsigned char volatile *reset; \ + asm("move.w #0x2700, %sr"); \ + reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000)); \ + while(1) \ + *reset |= (0x01 << 7);\ +}) +#elif defined(CONFIG_M523x) +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + movel #0x01000000, %sp; \ + moveal #0x40110000, %a0; \ + moveb #0x80, (%a0); \ + "); \ +}) +#elif defined(CONFIG_M520x) + /* + * The MCF5208 has a bit (SOFTRST) in memory (Reset Control Register + * RCR), that when set, resets the MCF5208. + */ +#define HARD_RESET_NOW() \ +({ \ + unsigned char volatile *reset; \ + asm("move.w #0x2700, %sr"); \ + reset = ((volatile unsigned char *)(MCF_IPSBAR + 0xA0000)); \ + while(1) \ + *reset |= 0x80; \ +}) +#else +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + moveal #0x4, %a0; \ + moveal (%a0), %a0; \ + jmp (%a0); \ + "); \ +}) +#endif +#endif #define arch_align_stack(x) (x) diff --git a/trunk/arch/m68knommu/kernel/entry.S b/trunk/arch/m68knommu/kernel/entry.S index f56faa5c9cd9..f4782d2dce8f 100644 --- a/trunk/arch/m68knommu/kernel/entry.S +++ b/trunk/arch/m68knommu/kernel/entry.S @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/trunk/arch/m68knommu/kernel/setup.c b/trunk/arch/m68knommu/kernel/setup.c index 5c2bb3eeaaa2..5985f1989021 100644 --- a/trunk/arch/m68knommu/kernel/setup.c +++ b/trunk/arch/m68knommu/kernel/setup.c @@ -166,13 +166,15 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. 
(Jate Sujjavanich)\n"); #endif - pr_debug("KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x " - "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext, - (int) &_sdata, (int) &_edata, - (int) &_sbss, (int) &_ebss); - pr_debug("MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ", - (int) &_ebss, (int) memory_start, - (int) memory_start, (int) memory_end); +#ifdef DEBUG + printk(KERN_DEBUG "KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x " + "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext, + (int) &_sdata, (int) &_edata, + (int) &_sbss, (int) &_ebss); + printk(KERN_DEBUG "MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ", + (int) &_ebss, (int) memory_start, + (int) memory_start, (int) memory_end); +#endif /* Keep a copy of command line */ *cmdline_p = &command_line[0]; diff --git a/trunk/arch/m68knommu/mm/init.c b/trunk/arch/m68knommu/mm/init.c index b1703c67a4f1..7befc0c357e0 100644 --- a/trunk/arch/m68knommu/mm/init.c +++ b/trunk/arch/m68knommu/mm/init.c @@ -126,7 +126,9 @@ void __init mem_init(void) unsigned long start_mem = memory_start; /* DAVIDM - these must start at end of kernel */ unsigned long end_mem = memory_end; /* DAVIDM - this must not include kernel stack at top */ - pr_debug("Mem_init: start=%lx, end=%lx\n", start_mem, end_mem); +#ifdef DEBUG + printk(KERN_DEBUG "Mem_init: start=%lx, end=%lx\n", start_mem, end_mem); +#endif end_mem &= PAGE_MASK; high_memory = (void *) end_mem; diff --git a/trunk/arch/m68knommu/platform/5206/config.c b/trunk/arch/m68knommu/platform/5206/config.c index f6f79874e9af..53a5920c2b71 100644 --- a/trunk/arch/m68knommu/platform/5206/config.c +++ b/trunk/arch/m68knommu/platform/5206/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m5206_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -104,21 +109,10 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5206_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); - mach_reset = m5206_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/5206e/config.c b/trunk/arch/m68knommu/platform/5206e/config.c index 65887799db81..db902540bf2c 100644 --- a/trunk/arch/m68knommu/platform/5206e/config.c +++ b/trunk/arch/m68knommu/platform/5206e/config.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m5206e_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -104,17 +109,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5206e_cpu_reset(void) -{ - 
local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -125,7 +119,7 @@ void __init config_BSP(char *commandp, int size) commandp[size-1] = 0; #endif /* CONFIG_NETtel */ - mach_reset = m5206e_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/520x/config.c b/trunk/arch/m68knommu/platform/520x/config.c index 1c43a8aec69b..855fc6a79d72 100644 --- a/trunk/arch/m68knommu/platform/520x/config.c +++ b/trunk/arch/m68knommu/platform/520x/config.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m520x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -164,17 +169,9 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -static void m520x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_RCR); -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { - mach_reset = m520x_cpu_reset; + mach_reset = coldfire_reset; m520x_uarts_init(); m520x_fec_init(); } diff --git a/trunk/arch/m68knommu/platform/523x/config.c b/trunk/arch/m68knommu/platform/523x/config.c index 961fefebca14..74133f27b30c 100644 --- a/trunk/arch/m68knommu/platform/523x/config.c +++ b/trunk/arch/m68knommu/platform/523x/config.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m523x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -140,20 +145,13 @@ void mcf_autovector(unsigned int vec) { /* Everything is auto-vectored on the 523x */ } -/***************************************************************************/ - -static void m523x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR); -} /***************************************************************************/ void __init config_BSP(char *commandp, int size) { mcf_disableall(); - mach_reset = m523x_cpu_reset; + mach_reset = coldfire_reset; m523x_uarts_init(); m523x_fec_init(); } diff --git a/trunk/arch/m68knommu/platform/5249/config.c b/trunk/arch/m68knommu/platform/5249/config.c index 93d998825925..9eab19d01eb1 100644 --- a/trunk/arch/m68knommu/platform/5249/config.c +++ b/trunk/arch/m68knommu/platform/5249/config.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m5249_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -101,21 +106,10 @@ void 
mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5249_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); - mach_reset = m5249_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/5272/config.c b/trunk/arch/m68knommu/platform/5272/config.c index 5f95fcde05fd..e049245f4092 100644 --- a/trunk/arch/m68knommu/platform/5272/config.c +++ b/trunk/arch/m68knommu/platform/5272/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -167,19 +170,6 @@ void mcf_settimericr(int timer, int level) /***************************************************************************/ -static void m5272_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to reset, and enabled */ - __raw_writew(0, MCF_MBAR + MCFSIM_WIRR); - __raw_writew(1, MCF_MBAR + MCFSIM_WRRR); - __raw_writew(0, MCF_MBAR + MCFSIM_WCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { #if defined (CONFIG_MOD5272) @@ -204,7 +194,7 @@ void __init config_BSP(char *commandp, int size) mcf_timervector = 69; mcf_profilevector = 70; - mach_reset = m5272_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/527x/config.c b/trunk/arch/m68knommu/platform/527x/config.c index f746439cfd3e..428b15922ef5 100644 --- a/trunk/arch/m68knommu/platform/527x/config.c +++ b/trunk/arch/m68knommu/platform/527x/config.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m527x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -222,18 +227,10 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -static void m527x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR); -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_disableall(); - mach_reset = m527x_cpu_reset; + mach_reset = coldfire_reset; m527x_uarts_init(); m527x_fec_init(); } diff --git a/trunk/arch/m68knommu/platform/528x/config.c b/trunk/arch/m68knommu/platform/528x/config.c index a1d1a61c4fe6..bee526f4d1af 100644 --- a/trunk/arch/m68knommu/platform/528x/config.c +++ b/trunk/arch/m68knommu/platform/528x/config.c @@ -31,6 +31,10 @@ /***************************************************************************/ +void 
coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m528x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -167,14 +171,6 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -static void m528x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR); -} - -/***************************************************************************/ - #ifdef CONFIG_WILDFIRE void wildfire_halt(void) { @@ -218,7 +214,6 @@ void __init config_BSP(char *commandp, int size) static int __init init_BSP(void) { - mach_reset = m528x_cpu_reset; m528x_uarts_init(); m528x_fec_init(); platform_add_devices(m528x_devices, ARRAY_SIZE(m528x_devices)); diff --git a/trunk/arch/m68knommu/platform/5307/config.c b/trunk/arch/m68knommu/platform/5307/config.c index 39da9e9ff674..44803bf70a6e 100644 --- a/trunk/arch/m68knommu/platform/5307/config.c +++ b/trunk/arch/m68knommu/platform/5307/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -116,17 +119,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5307_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -142,7 +134,7 @@ void __init config_BSP(char *commandp, int size) mcf_timerlevel = 6; #endif - mach_reset = m5307_cpu_reset; + mach_reset = coldfire_reset; #ifdef CONFIG_BDM_DISABLE /* diff --git a/trunk/arch/m68knommu/platform/532x/config.c b/trunk/arch/m68knommu/platform/532x/config.c index cdb761971f7a..591f2f801134 100644 --- a/trunk/arch/m68knommu/platform/532x/config.c +++ b/trunk/arch/m68knommu/platform/532x/config.c @@ -31,6 +31,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -162,14 +164,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -static void m532x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_RCR); -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -187,7 +181,7 @@ void __init config_BSP(char *commandp, int size) mcf_timervector = 64+32; mcf_profilevector = 64+33; - mach_reset = m532x_cpu_reset; + mach_reset = coldfire_reset; #ifdef CONFIG_BDM_DISABLE /* diff --git a/trunk/arch/m68knommu/platform/5407/config.c b/trunk/arch/m68knommu/platform/5407/config.c index b41d942bf8d0..0ee8c1a200c8 100644 --- a/trunk/arch/m68knommu/platform/5407/config.c +++ b/trunk/arch/m68knommu/platform/5407/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include 
#include @@ -20,6 +21,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -107,17 +110,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5407_cpu_reset(void) -{ - local_irq_disable(); - /* set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -129,7 +121,7 @@ void __init config_BSP(char *commandp, int size) mcf_timerlevel = 6; #endif - mach_reset = m5407_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/coldfire/vectors.c b/trunk/arch/m68knommu/platform/coldfire/vectors.c index bdca0297fa9a..6cf894620234 100644 --- a/trunk/arch/m68knommu/platform/coldfire/vectors.c +++ b/trunk/arch/m68knommu/platform/coldfire/vectors.c @@ -96,3 +96,10 @@ void ack_vector(unsigned int irq) } /***************************************************************************/ + +void coldfire_reset(void) +{ + HARD_RESET_NOW(); +} + +/***************************************************************************/ diff --git a/trunk/arch/powerpc/include/asm/hw_irq.h b/trunk/arch/powerpc/include/asm/hw_irq.h index 53512374e1c9..20a44d0c9fdd 100644 --- a/trunk/arch/powerpc/include/asm/hw_irq.h +++ b/trunk/arch/powerpc/include/asm/hw_irq.h @@ -156,6 +156,8 @@ static inline void clear_perf_counter_pending(void) "i" (offsetof(struct paca_struct, perf_counter_pending))); } +extern void perf_counter_do_pending(void); + #else static inline unsigned long test_perf_counter_pending(void) @@ -165,6 +167,7 @@ static inline unsigned long test_perf_counter_pending(void) static inline void set_perf_counter_pending(void) {} static inline void clear_perf_counter_pending(void) {} +static inline void perf_counter_do_pending(void) {} #endif /* CONFIG_PERF_COUNTERS */ #endif /* __KERNEL__ */ diff --git a/trunk/arch/powerpc/kernel/irq.c b/trunk/arch/powerpc/kernel/irq.c index 844d3f882a15..feff792ed0f9 100644 --- a/trunk/arch/powerpc/kernel/irq.c +++ b/trunk/arch/powerpc/kernel/irq.c @@ -53,7 +53,6 @@ #include #include #include -#include #include #include diff --git a/trunk/arch/x86/kernel/apic/io_apic.c b/trunk/arch/x86/kernel/apic/io_apic.c index ef8d9290c7ea..94605e7f6a54 100644 --- a/trunk/arch/x86/kernel/apic/io_apic.c +++ b/trunk/arch/x86/kernel/apic/io_apic.c @@ -187,8 +187,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index d1c636bf31a7..be5ae80f897f 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -301,15 +301,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - #define 
MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -365,14 +363,13 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } -#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -402,14 +399,8 @@ static void __init reserve_initrd(void) return; } -#ifdef CONFIG_X86_32 relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else diff --git a/trunk/block/blk-core.c b/trunk/block/blk-core.c index f6452f692501..d17d71c71d4f 100644 --- a/trunk/block/blk-core.c +++ b/trunk/block/blk-core.c @@ -884,10 +884,9 @@ EXPORT_SYMBOL(blk_get_request); /** * blk_make_request - given a bio, allocate a corresponding struct request. - * @q: target request queue + * * @bio: The bio describing the memory mappings that will be submitted for IO. * It may be a chained-bio properly constructed by block/bio layer. - * @gfp_mask: gfp flags to be used for memory allocation * * blk_make_request is the parallel of generic_make_request for BLOCK_PC * type commands. Where the struct request needs to be farther initialized by @@ -1873,14 +1872,14 @@ EXPORT_SYMBOL(blk_fetch_request); /** * blk_update_request - Special helper function for request stacking drivers - * @req: the request being processed + * @rq: the request being processed * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @req + * @nr_bytes: number of bytes to complete @rq * * Description: - * Ends I/O on a number of bytes attached to @req, but doesn't complete - * the request structure even if @req doesn't have leftover. - * If @req has leftover, sets it up for the next range of segments. + * Ends I/O on a number of bytes attached to @rq, but doesn't complete + * the request structure even if @rq doesn't have leftover. + * If @rq has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. @@ -2146,7 +2145,7 @@ EXPORT_SYMBOL_GPL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Completely finish @rq. @@ -2167,7 +2166,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_all); /** * blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. 
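For context on the completion helpers whose kernel-doc these hunks touch, a hedged sketch of how a hypothetical driver would use them; the helper and its return convention are real for this kernel, the driver function is invented:

#include <linux/blkdev.h>

/* Complete @bytes of @rq; blk_end_request() returns true while the
 * request still has buffers pending and has been re-set up for the
 * next range of segments. */
static void mydrv_complete_bytes(struct request *rq, int error,
                                 unsigned int bytes)
{
        if (blk_end_request(rq, error, bytes))
                return;         /* partial completion, rq lives on */
        /* rq has been fully finished and released here */
}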
@@ -2204,7 +2203,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Completely finish @rq. Must be called with queue lock held. @@ -2225,7 +2224,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. Must diff --git a/trunk/block/blk-settings.c b/trunk/block/blk-settings.c index d71cedc09c4e..1c4df9bf6813 100644 --- a/trunk/block/blk-settings.c +++ b/trunk/block/blk-settings.c @@ -343,7 +343,7 @@ EXPORT_SYMBOL(blk_queue_physical_block_size); /** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device - * @offset: alignment offset in bytes + * @alignment: alignment offset in bytes * * Description: * Some devices are naturally misaligned to compensate for things like @@ -362,7 +362,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); /** * blk_queue_io_min - set minimum request size for the queue * @q: the request queue for the device - * @min: smallest I/O size in bytes + * @io_min: smallest I/O size in bytes * * Description: * Some devices have an internal block size bigger than the reported @@ -385,7 +385,7 @@ EXPORT_SYMBOL(blk_queue_io_min); /** * blk_queue_io_opt - set optimal request size for the queue * @q: the request queue for the device - * @opt: optimal request size in bytes + * @io_opt: optimal request size in bytes * * Description: * Drivers can call this function to set the preferred I/O request diff --git a/trunk/drivers/ieee1394/dv1394.c b/trunk/drivers/ieee1394/dv1394.c index 2cd00b5b45b4..823a6297a1af 100644 --- a/trunk/drivers/ieee1394/dv1394.c +++ b/trunk/drivers/ieee1394/dv1394.c @@ -1789,13 +1789,12 @@ static int dv1394_open(struct inode *inode, struct file *file) } else { /* look up the card by ID */ unsigned long flags; - int idx = ieee1394_file_to_instance(file); spin_lock_irqsave(&dv1394_cards_lock, flags); if (!list_empty(&dv1394_cards)) { struct video_card *p; list_for_each_entry(p, &dv1394_cards, list) { - if ((p->id) == idx) { + if ((p->id) == ieee1394_file_to_instance(file)) { video = p; break; } @@ -1804,7 +1803,7 @@ static int dv1394_open(struct inode *inode, struct file *file) spin_unlock_irqrestore(&dv1394_cards_lock, flags); if (!video) { - debug_printk("dv1394: OHCI card %d not found", idx); + debug_printk("dv1394: OHCI card %d not found", ieee1394_file_to_instance(file)); return -ENODEV; } diff --git a/trunk/drivers/ieee1394/ieee1394_core.h b/trunk/drivers/ieee1394/ieee1394_core.h index 28b9f58bafd2..21d50f73a210 100644 --- a/trunk/drivers/ieee1394/ieee1394_core.h +++ b/trunk/drivers/ieee1394/ieee1394_core.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include "hosts.h" @@ -156,10 +155,7 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size, */ static inline unsigned char ieee1394_file_to_instance(struct file *file) { - int idx = cdev_index(file->f_path.dentry->d_inode); - if (idx < 0) - idx = 0; - return idx; + return file->f_path.dentry->d_inode->i_cindex; } extern int hpsb_disable_irm; diff --git a/trunk/drivers/usb/core/inode.c b/trunk/drivers/usb/core/inode.c index 
ffe75e83787c..dff5760a37f6 100644 --- a/trunk/drivers/usb/core/inode.c +++ b/trunk/drivers/usb/core/inode.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include "usb.h" #include "hcd.h" @@ -266,13 +265,9 @@ static int remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - lock_kernel(); - if (usbfs_mount && usbfs_mount->mnt_sb) update_sb(usbfs_mount->mnt_sb); - unlock_kernel(); - return 0; } diff --git a/trunk/fs/adfs/adfs.h b/trunk/fs/adfs/adfs.h index a6665f37f456..e0a85dbeeb88 100644 --- a/trunk/fs/adfs/adfs.h +++ b/trunk/fs/adfs/adfs.h @@ -53,7 +53,6 @@ struct adfs_dir_ops { int (*update)(struct adfs_dir *dir, struct object_info *obj); int (*create)(struct adfs_dir *dir, struct object_info *obj); int (*remove)(struct adfs_dir *dir, struct object_info *obj); - int (*sync)(struct adfs_dir *dir); void (*free)(struct adfs_dir *dir); }; @@ -91,8 +90,7 @@ extern const struct dentry_operations adfs_dentry_operations; extern struct adfs_dir_ops adfs_f_dir_ops; extern struct adfs_dir_ops adfs_fplus_dir_ops; -extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, - int wait); +extern int adfs_dir_update(struct super_block *sb, struct object_info *obj); /* file.c */ extern const struct inode_operations adfs_file_inode_operations; diff --git a/trunk/fs/adfs/dir.c b/trunk/fs/adfs/dir.c index 4d4073447d1a..e867ccf37246 100644 --- a/trunk/fs/adfs/dir.c +++ b/trunk/fs/adfs/dir.c @@ -83,7 +83,7 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } int -adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) +adfs_dir_update(struct super_block *sb, struct object_info *obj) { int ret = -EINVAL; #ifdef CONFIG_ADFS_FS_RW @@ -106,12 +106,6 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) ret = ops->update(&dir, obj); write_unlock(&adfs_dir_lock); - if (wait) { - int err = ops->sync(&dir); - if (!ret) - ret = err; - } - ops->free(&dir); out: #endif @@ -205,7 +199,7 @@ const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = adfs_readdir, - .fsync = simple_fsync, + .fsync = file_fsync, }; static int diff --git a/trunk/fs/adfs/dir_f.c b/trunk/fs/adfs/dir_f.c index 31df6adf0de6..ea7df2146921 100644 --- a/trunk/fs/adfs/dir_f.c +++ b/trunk/fs/adfs/dir_f.c @@ -437,22 +437,6 @@ adfs_f_update(struct adfs_dir *dir, struct object_info *obj) #endif } -static int -adfs_f_sync(struct adfs_dir *dir) -{ - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } - - return err; -} - static void adfs_f_free(struct adfs_dir *dir) { @@ -472,6 +456,5 @@ struct adfs_dir_ops adfs_f_dir_ops = { .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, .update = adfs_f_update, - .sync = adfs_f_sync, .free = adfs_f_free }; diff --git a/trunk/fs/adfs/dir_fplus.c b/trunk/fs/adfs/dir_fplus.c index 139e0f345f18..1ec644e32df9 100644 --- a/trunk/fs/adfs/dir_fplus.c +++ b/trunk/fs/adfs/dir_fplus.c @@ -161,22 +161,6 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) return ret; } -static int -adfs_fplus_sync(struct adfs_dir *dir) -{ - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } - - return err; -} - static void adfs_fplus_free(struct adfs_dir *dir) { 
@@ -191,6 +175,5 @@ struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, - .sync = adfs_fplus_sync, .free = adfs_fplus_free }; diff --git a/trunk/fs/adfs/file.c b/trunk/fs/adfs/file.c index 8224d54a2afb..36e381c6a99a 100644 --- a/trunk/fs/adfs/file.c +++ b/trunk/fs/adfs/file.c @@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = file_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/trunk/fs/adfs/inode.c b/trunk/fs/adfs/inode.c index 05b3a677201d..e647200262a2 100644 --- a/trunk/fs/adfs/inode.c +++ b/trunk/fs/adfs/inode.c @@ -376,7 +376,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int wait) +int adfs_write_inode(struct inode *inode, int unused) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int wait) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wait); + ret = adfs_dir_update(sb, &obj); unlock_kernel(); return ret; } diff --git a/trunk/fs/adfs/map.c b/trunk/fs/adfs/map.c index 568081b93f73..92ab4fbc2031 100644 --- a/trunk/fs/adfs/map.c +++ b/trunk/fs/adfs/map.c @@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock); #define GET_FRAG_ID(_map,_start,_idmask) \ ({ \ unsigned char *_m = _map + (_start >> 3); \ - u32 _frag = get_unaligned_le32(_m); \ + u32 _frag = get_unaligned((u32 *)_m); \ _frag >>= (_start & 7); \ _frag & _idmask; \ }) diff --git a/trunk/fs/adfs/super.c b/trunk/fs/adfs/super.c index 0ec5aaf47aa7..dd9becca4241 100644 --- a/trunk/fs/adfs/super.c +++ b/trunk/fs/adfs/super.c @@ -132,15 +132,11 @@ static void adfs_put_super(struct super_block *sb) int i; struct adfs_sb_info *asb = ADFS_SB(sb); - lock_kernel(); - for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); kfree(asb); sb->s_fs_info = NULL; - - unlock_kernel(); } static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) diff --git a/trunk/fs/affs/affs.h b/trunk/fs/affs/affs.h index e511dc621a2e..1a2d5e3c7f4e 100644 --- a/trunk/fs/affs/affs.h +++ b/trunk/fs/affs/affs.h @@ -182,7 +182,6 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); -int affs_file_fsync(struct file *, struct dentry *, int); /* dir.c */ diff --git a/trunk/fs/affs/dir.c b/trunk/fs/affs/dir.c index 8ca8f3a55599..7b36904dbeac 100644 --- a/trunk/fs/affs/dir.c +++ b/trunk/fs/affs/dir.c @@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = affs_readdir, - .fsync = affs_file_fsync, + .fsync = file_fsync, }; /* diff --git a/trunk/fs/affs/file.c b/trunk/fs/affs/file.c index 184e55c1c9ba..9246cb4aa018 100644 --- a/trunk/fs/affs/file.c +++ b/trunk/fs/affs/file.c @@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = { .mmap = generic_file_mmap, .open = affs_file_open, .release = affs_file_release, - .fsync = affs_file_fsync, + .fsync = file_fsync, .splice_read = generic_file_splice_read, }; @@ -915,15 +915,3 @@ affs_truncate(struct inode *inode) } 
affs_free_prealloc(inode); } - -int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) -{ - struct inode * inode = dentry->d_inode; - int ret, err; - - ret = write_inode_now(inode, 0); - err = sync_blockdev(inode->i_sb->s_bdev); - if (!ret) - ret = err; - return ret; -} diff --git a/trunk/fs/affs/super.c b/trunk/fs/affs/super.c index 104fdcb3a7fc..63f5183f263b 100644 --- a/trunk/fs/affs/super.c +++ b/trunk/fs/affs/super.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "affs.h" extern struct timezone sys_tz; @@ -24,68 +23,50 @@ extern struct timezone sys_tz; static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); -static void -affs_commit_super(struct super_block *sb, int clean) -{ - struct affs_sb_info *sbi = AFFS_SB(sb); - struct buffer_head *bh = sbi->s_root_bh; - struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); - - tail->bm_flag = cpu_to_be32(clean); - secs_to_datestamp(get_seconds(), &tail->disk_change); - affs_fix_checksum(sb, bh); - mark_buffer_dirty(bh); -} - static void affs_put_super(struct super_block *sb) { struct affs_sb_info *sbi = AFFS_SB(sb); pr_debug("AFFS: put_super()\n"); - lock_kernel(); - - if (!(sb->s_flags & MS_RDONLY)) - affs_commit_super(sb, 1); + if (!(sb->s_flags & MS_RDONLY)) { + AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); + secs_to_datestamp(get_seconds(), + &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); + affs_fix_checksum(sb, sbi->s_root_bh); + mark_buffer_dirty(sbi->s_root_bh); + } kfree(sbi->s_prefix); affs_free_bitmap(sb); affs_brelse(sbi->s_root_bh); kfree(sbi); sb->s_fs_info = NULL; - - unlock_kernel(); + return; } static void affs_write_super(struct super_block *sb) { int clean = 2; + struct affs_sb_info *sbi = AFFS_SB(sb); - lock_super(sb); if (!(sb->s_flags & MS_RDONLY)) { // if (sbi->s_bitmap[i].bm_bh) { // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { // clean = 0; - affs_commit_super(sb, clean); + AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); + secs_to_datestamp(get_seconds(), + &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); + affs_fix_checksum(sb, sbi->s_root_bh); + mark_buffer_dirty(sbi->s_root_bh); sb->s_dirt = !clean; /* redo until bitmap synced */ } else sb->s_dirt = 0; - unlock_super(sb); pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); } -static int -affs_sync_fs(struct super_block *sb, int wait) -{ - lock_super(sb); - affs_commit_super(sb, 2); - sb->s_dirt = 0; - unlock_super(sb); - return 0; -} - static struct kmem_cache * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) @@ -143,7 +124,6 @@ static const struct super_operations affs_sops = { .clear_inode = affs_clear_inode, .put_super = affs_put_super, .write_super = affs_write_super, - .sync_fs = affs_sync_fs, .statfs = affs_statfs, .remount_fs = affs_remount, .show_options = generic_show_options, @@ -527,7 +507,6 @@ affs_remount(struct super_block *sb, int *flags, char *data) kfree(new_opts); return -EINVAL; } - lock_kernel(); replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; @@ -535,10 +514,8 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_uid = uid; sbi->s_gid = gid; - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - } if (*flags & MS_RDONLY) { sb->s_dirt = 1; while (sb->s_dirt) @@ -547,7 +524,6 @@ affs_remount(struct super_block 
*sb, int *flags, char *data) } else res = affs_init_bitmap(sb, flags); - unlock_kernel(); return res; } diff --git a/trunk/fs/afs/mntpt.c b/trunk/fs/afs/mntpt.c index c52be53f6946..2b9e2d03a390 100644 --- a/trunk/fs/afs/mntpt.c +++ b/trunk/fs/afs/mntpt.c @@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) + follow_down(&nd->path.mnt, &nd->path.dentry)) ; err = 0; default: diff --git a/trunk/fs/afs/super.c b/trunk/fs/afs/super.c index ad0514d0115f..76828e5f8a39 100644 --- a/trunk/fs/afs/super.c +++ b/trunk/fs/afs/super.c @@ -440,12 +440,8 @@ static void afs_put_super(struct super_block *sb) _enter(""); - lock_kernel(); - afs_put_volume(as->volume); - unlock_kernel(); - _leave(""); } diff --git a/trunk/fs/autofs/dirhash.c b/trunk/fs/autofs/dirhash.c index 2316e944a109..4eb4d8dfb2f1 100644 --- a/trunk/fs/autofs/dirhash.c +++ b/trunk/fs/autofs/dirhash.c @@ -85,12 +85,13 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, } path.mnt = mnt; path_get(&path); - if (!follow_down(&path)) { + if (!follow_down(&path.mnt, &path.dentry)) { path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && follow_down(&path)); + while (d_mountpoint(path.dentry) && + follow_down(&path.mnt, &path.dentry)) ; umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/trunk/fs/autofs4/autofs_i.h b/trunk/fs/autofs4/autofs_i.h index 8f7cdde41733..b7ff33c63101 100644 --- a/trunk/fs/autofs4/autofs_i.h +++ b/trunk/fs/autofs4/autofs_i.h @@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) +static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) { int res = 0; - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); + while (d_mountpoint(*dentry)) { + int followed = follow_down(mnt, dentry); if (!followed) break; res = 1; diff --git a/trunk/fs/autofs4/dev-ioctl.c b/trunk/fs/autofs4/dev-ioctl.c index f3da2eb51f56..84168c0dcc2d 100644 --- a/trunk/fs/autofs4/dev-ioctl.c +++ b/trunk/fs/autofs4/dev-ioctl.c @@ -192,42 +192,77 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, return 0; } -static int find_autofs_mount(const char *pathname, - struct path *res, - int test(struct path *path, void *data), - void *data) +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested device number (aka. new_encode_dev(sb->s_dev). 
+ */ +static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) { - struct path path; - int err = kern_path(pathname, 0, &path); - if (err) - return err; + struct dentry *dentry; + struct inode *inode; + struct super_block *sb; + dev_t s_dev; + unsigned int err; + err = -ENOENT; - while (path.dentry == path.mnt->mnt_root) { - if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { - if (test(&path, data)) { - path_get(&path); - if (!err) /* already found some */ - path_put(res); - *res = path; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + inode = nd->path.dentry->d_inode; + if (!inode) + break; + + sb = inode->i_sb; + s_dev = new_encode_dev(sb->s_dev); + if (devno == s_dev) { + if (sb->s_magic == AUTOFS_SUPER_MAGIC) { err = 0; + break; } } - if (!follow_up(&path)) - break; } - path_put(&path); +out: return err; } -static int test_by_dev(struct path *path, void *p) +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested mount type (ie. indirect, direct or offset). + */ +static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) { - return path->mnt->mnt_sb->s_dev == *(dev_t *)p; -} + struct dentry *dentry; + struct autofs_info *ino; + unsigned int err; -static int test_by_type(struct path *path, void *p) -{ - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); - return ino && ino->sbi->type & *(unsigned *)p; + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + ino = autofs4_dentry_ino(nd->path.dentry); + if (ino && ino->sbi->type & type) { + err = 0; + break; + } + } +out: + return err; } static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) @@ -248,25 +283,31 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). */ -static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) +static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) { + struct file *filp; + struct nameidata nd; int err, fd; fd = get_unused_fd(); if (likely(fd >= 0)) { - struct file *filp; - struct path path; - - err = find_autofs_mount(name, &path, test_by_dev, &devid); + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; /* - * Find autofs super block that has the device number + * Search down, within the parent, looking for an + * autofs super block that has the device number * corresponding to the autofs fs we want to open. 
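The device-number comparison driving this search is worth spelling out: the walk matches the userspace-visible encoding of sb->s_dev. A small illustrative helper (hypothetical name) equivalent to the test inside autofs_dev_ioctl_find_super() above:

#include <linux/fs.h>
#include <linux/kdev_t.h>

/* Does @sb carry the userspace-visible device number @devno?
 * Userspace learns devno as new_encode_dev(sb->s_dev). */
static int autofs_sb_matches_devno(struct super_block *sb, dev_t devno)
{
        return new_encode_dev(sb->s_dev) == devno;
}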
*/ + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) { + path_put(&nd.path); + goto out; + } - filp = dentry_open(path.dentry, path.mnt, O_RDONLY, + filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, current_cred()); if (IS_ERR(filp)) { err = PTR_ERR(filp); @@ -299,7 +340,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp, param->ioctlfd = -1; path = param->path; - devid = new_decode_dev(param->openmount.devid); + devid = param->openmount.devid; err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); @@ -434,7 +475,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_dev_ioctl *param) { struct autofs_info *ino; - struct path path; + struct nameidata nd; + const char *path; dev_t devid; int err = -ENOENT; @@ -443,24 +485,32 @@ static int autofs_dev_ioctl_requester(struct file *fp, goto out; } - devid = sbi->sb->s_dev; + path = param->path; + devid = new_encode_dev(sbi->sb->s_dev); param->requester.uid = param->requester.gid = -1; - err = find_autofs_mount(param->path, &path, test_by_dev, &devid); + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - ino = autofs4_dentry_ino(path.dentry); + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry); + autofs4_expire_wait(nd.path.dentry); spin_lock(&sbi->fs_lock); param->requester.uid = ino->uid; param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } - path_put(&path); + +out_release: + path_put(&nd.path); out: return err; } @@ -519,8 +569,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - struct path path; - const char *name; + struct nameidata nd; + const char *path; unsigned int type; unsigned int devid, magic; int err = -ENOENT; @@ -530,46 +580,71 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; } - name = param->path; + path = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { - if (autofs_type_any(type)) - err = kern_path(name, LOOKUP_FOLLOW, &path); - else - err = find_autofs_mount(name, &path, test_by_type, &type); - if (err) - goto out; - devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + if (autofs_type_any(type)) { + struct super_block *sb; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + goto out; + + sb = nd.path.dentry->d_sb; + devid = new_encode_dev(sb->s_dev); + } else { + struct autofs_info *ino; + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_sbi_type(&nd, type); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + devid = autofs4_get_dev(ino->sbi); + } + err = 0; - if (path.dentry->d_inode && - path.mnt->mnt_root == path.dentry) { + if (nd.path.dentry->d_inode && + nd.path.mnt->mnt_root == nd.path.dentry) { err = 1; - magic = path.dentry->d_inode->i_sb->s_magic; + magic = nd.path.dentry->d_inode->i_sb->s_magic; } } else { - dev_t dev = sbi->sb->s_dev; + dev_t dev = autofs4_get_dev(sbi); - err = find_autofs_mount(name, &path, test_by_dev, &dev); + err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - devid = new_encode_dev(dev); + err = autofs_dev_ioctl_find_super(&nd, dev); + if (err) + goto out_release; + + devid = dev; - err = have_submounts(path.dentry); + err = 
have_submounts(nd.path.dentry); - if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { - if (follow_down(&path)) - magic = path.mnt->mnt_sb->s_magic; + if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { + if (follow_down(&nd.path.mnt, &nd.path.dentry)) { + struct inode *inode = nd.path.dentry->d_inode; + magic = inode->i_sb->s_magic; + } } } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; - path_put(&path); + +out_release: + path_put(&nd.path); out: return err; } diff --git a/trunk/fs/autofs4/expire.c b/trunk/fs/autofs4/expire.c index aa39ae83f019..3077d8f16523 100644 --- a/trunk/fs/autofs4/expire.c +++ b/trunk/fs/autofs4/expire.c @@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry, static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; - struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; DPRINTK("dentry %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - path_get(&path); + mntget(mnt); + dget(dentry); - if (!follow_down(&path)) + if (!follow_down(&mnt, &dentry)) goto done; - if (is_autofs4_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + if (is_autofs4_dentry(dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) * Otherwise it's an offset mount and we need to check * if we can umount its mount, if there is one. */ - if (!d_mountpoint(path.dentry)) { + if (!d_mountpoint(dentry)) { status = 0; goto done; } @@ -86,7 +86,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: DPRINTK("returning = %d", status); - path_put(&path); + dput(dentry); + mntput(mnt); return status; } diff --git a/trunk/fs/autofs4/root.c b/trunk/fs/autofs4/root.c index b96a3c57359d..e383bf0334f1 100644 --- a/trunk/fs/autofs4/root.c +++ b/trunk/fs/autofs4/root.c @@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger + * to break out of follow_down() at the autofs mount trigger (d_mounted--), so we can see the expiring flag, and manage * the blocking and following here until the expire is completed. */ @@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); /* Follow down to our covering mount. */ - if (!follow_down(&nd->path)) + if (!follow_down(&nd->path.mnt, &nd->path.dentry)) goto done; goto follow; } @@ -230,7 +230,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) * to follow it.
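With struct path gone from autofs4_mount_busy() above, the reference pinning is spelled out by hand. path_get() and path_put() bundle exactly these operations in exactly this order, so the two spellings are equivalent; condensed for illustration (helper names as in the hunk):

	/* the equivalent of path_get(&path) on the pair (mnt, dentry) */
	mntget(mnt);
	dget(dentry);

	/* ... follow_down() may now replace both pointers, dropping the
	 * references it consumes and returning pinned replacements ... */

	/* the equivalent of path_put(&path): dentry first, then mount */
	dput(dentry);
	mntput(mnt);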
*/ if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { + if (!autofs4_follow_mount(&nd->path.mnt, + &nd->path.dentry)) { status = -ENOENT; goto out_error; } diff --git a/trunk/fs/befs/linuxvfs.c b/trunk/fs/befs/linuxvfs.c index 9367b6297d84..76afd0d6b86c 100644 --- a/trunk/fs/befs/linuxvfs.c +++ b/trunk/fs/befs/linuxvfs.c @@ -737,8 +737,6 @@ parse_options(char *options, befs_mount_options * opts) static void befs_put_super(struct super_block *sb) { - lock_kernel(); - kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; @@ -749,8 +747,7 @@ befs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); + return; } /* Allocate private field of the superblock, fill it. diff --git a/trunk/fs/bfs/dir.c b/trunk/fs/bfs/dir.c index 54bd07d44e68..4dd1b623f937 100644 --- a/trunk/fs/bfs/dir.c +++ b/trunk/fs/bfs/dir.c @@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, - .fsync = simple_fsync, + .fsync = file_fsync, .llseek = generic_file_llseek, }; @@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) inode->i_nlink = 1; } de->ino = 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; @@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME_SEC; inode_dec_link_count(new_inode); } - mark_buffer_dirty_inode(old_bh, old_dir); + mark_buffer_dirty(old_bh); error = 0; end_rename: @@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name, for (i = 0; i < BFS_NAMELEN; i++) de->name[i] = (i < namelen) ? name[i] : 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); brelse(bh); return 0; } diff --git a/trunk/fs/bfs/inode.c b/trunk/fs/bfs/inode.c index 6f60336c6628..cc4062d12ca2 100644 --- a/trunk/fs/bfs/inode.c +++ b/trunk/fs/bfs/inode.c @@ -30,7 +30,6 @@ MODULE_LICENSE("GPL"); #define dprintf(x...) 
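The bfs hunks above undo two related changes at once: directory buffers go back to plain mark_buffer_dirty() instead of mark_buffer_dirty_inode(), which had threaded them onto the owning inode's private list where a targeted fsync (sync_mapping_buffers(), as used by simple_fsync) could find them, and .fsync accordingly falls back to the heavyweight file_fsync(), which flushes the whole device. For context, file_fsync() of this era looked roughly as follows (reproduced from memory and hedged accordingly):

	int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
	{
		struct inode *inode = dentry->d_inode;
		struct super_block *sb = inode->i_sb;
		int ret, err;

		/* sync the inode to buffers */
		ret = write_inode_now(inode, 0);

		/* sync the superblock to buffers */
		lock_super(sb);
		if (sb->s_dirt && sb->s_op->write_super)
			sb->s_op->write_super(sb);
		unlock_super(sb);

		/* .. finally sync the buffers to disk */
		err = sync_blockdev(sb->s_bdev);
		if (!ret)
			ret = err;
		return ret;
	}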
#endif -static void bfs_write_super(struct super_block *s); void dump_imap(const char *prefix, struct super_block *s); struct inode *bfs_iget(struct super_block *sb, unsigned long ino) @@ -98,15 +97,14 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int wait) +static int bfs_write_inode(struct inode *inode, int unused) { - struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int block, off; - int err = 0; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); dprintf("ino=%08x\n", ino); @@ -147,14 +145,9 @@ static int bfs_write_inode(struct inode *inode, int wait) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } brelse(bh); mutex_unlock(&info->bfs_lock); - return err; + return 0; } static void bfs_delete_inode(struct inode *inode) @@ -216,26 +209,6 @@ static void bfs_delete_inode(struct inode *inode) clear_inode(inode); } -static int bfs_sync_fs(struct super_block *sb, int wait) -{ - struct bfs_sb_info *info = BFS_SB(sb); - - mutex_lock(&info->bfs_lock); - mark_buffer_dirty(info->si_sbh); - sb->s_dirt = 0; - mutex_unlock(&info->bfs_lock); - - return 0; -} - -static void bfs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - bfs_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); @@ -243,18 +216,11 @@ static void bfs_put_super(struct super_block *s) if (!info) return; - lock_kernel(); - - if (s->s_dirt) - bfs_write_super(s); - brelse(info->si_sbh); mutex_destroy(&info->bfs_lock); kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; - - unlock_kernel(); } static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -274,6 +240,17 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void bfs_write_super(struct super_block *s) +{ + struct bfs_sb_info *info = BFS_SB(s); + + mutex_lock(&info->bfs_lock); + if (!(s->s_flags & MS_RDONLY)) + mark_buffer_dirty(info->si_sbh); + s->s_dirt = 0; + mutex_unlock(&info->bfs_lock); +} + static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) @@ -321,7 +298,6 @@ static const struct super_operations bfs_sops = { .delete_inode = bfs_delete_inode, .put_super = bfs_put_super, .write_super = bfs_write_super, - .sync_fs = bfs_sync_fs, .statfs = bfs_statfs, }; diff --git a/trunk/fs/block_dev.c b/trunk/fs/block_dev.c index 3a6d4fb2a329..931f6b8c4b2f 100644 --- a/trunk/fs/block_dev.c +++ b/trunk/fs/block_dev.c @@ -176,22 +176,17 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } -int __sync_blockdev(struct block_device *bdev, int wait) -{ - if (!bdev) - return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); -} - /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. 
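Note what the bfs_write_inode() hunk above discards: the reverted version renames the wait argument to unused and never waits, so a synchronous ->write_inode(inode, 1) call now returns after merely dirtying the buffer. The removed lines followed the standard buffer-head pattern for honoring wait, shown here for reference (sketch assembled from the dropped lines themselves):

	mark_buffer_dirty(bh);
	if (wait) {
		/* force this buffer out now and report any write error */
		sync_dirty_buffer(bh);
		if (buffer_req(bh) && !buffer_uptodate(bh))
			err = -EIO;
	}
	brelse(bh);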
*/ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + int ret = 0; + + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); + return ret; } EXPORT_SYMBOL(sync_blockdev); @@ -204,7 +199,7 @@ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { - int res = sync_filesystem(sb); + int res = fsync_super(sb); drop_super(sb); return res; } @@ -246,7 +241,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); - sync_filesystem(sb); + __fsync_super(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); diff --git a/trunk/fs/btrfs/Makefile b/trunk/fs/btrfs/Makefile index a35eb36b32fd..94212844a9bc 100644 --- a/trunk/fs/btrfs/Makefile +++ b/trunk/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o relocation.o + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o delayed-ref.o diff --git a/trunk/fs/btrfs/acl.c b/trunk/fs/btrfs/acl.c index 603972576f0f..cbba000dccbe 100644 --- a/trunk/fs/btrfs/acl.c +++ b/trunk/fs/btrfs/acl.c @@ -351,4 +351,9 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + #endif /* CONFIG_FS_POSIX_ACL */ diff --git a/trunk/fs/btrfs/async-thread.c b/trunk/fs/btrfs/async-thread.c index 7f88628a1a72..502c3d61de62 100644 --- a/trunk/fs/btrfs/async-thread.c +++ b/trunk/fs/btrfs/async-thread.c @@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); - worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, workers->num_workers + i); + worker->workers = workers; if (IS_ERR(worker->task)) { kfree(worker); ret = PTR_ERR(worker->task); diff --git a/trunk/fs/btrfs/btrfs_inode.h b/trunk/fs/btrfs/btrfs_inode.h index acb4f3517582..b30986f00b9d 100644 --- a/trunk/fs/btrfs/btrfs_inode.h +++ b/trunk/fs/btrfs/btrfs_inode.h @@ -72,9 +72,6 @@ struct btrfs_inode { */ struct list_head ordered_operations; - /* node for the red-black tree that links inodes in subvolume root */ - struct rb_node rb_node; - /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -157,4 +154,5 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } + #endif diff --git a/trunk/fs/btrfs/compression.c b/trunk/fs/btrfs/compression.c index de1e2fd32080..ab07627084f1 100644 --- a/trunk/fs/btrfs/compression.c +++ b/trunk/fs/btrfs/compression.c @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (btrfs_test_flag(inode, NODATASUM)) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + if (!btrfs_test_flag(inode, NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ 
-697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + if (!btrfs_test_flag(inode, NODATASUM)) btrfs_lookup_bio_sums(root, inode, comp_bio, sums); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/trunk/fs/btrfs/crc32c.h b/trunk/fs/btrfs/crc32c.h new file mode 100644 index 000000000000..6e1b3de36700 --- /dev/null +++ b/trunk/fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif + diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index 60a45f3a4e91..fedf8b9f03a2 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -197,7 +197,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, u32 nritems; int ret = 0; int level; - struct btrfs_disk_key disk_key; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); @@ -205,37 +212,28 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - if (level == 0) - btrfs_item_key(buf, &disk_key, 0); - else - btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); - if (IS_ERR(cow)) + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); return PTR_ERR(cow); + } copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | - BTRFS_HEADER_FLAG_RELOC); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); - else - btrfs_set_header_owner(cow, new_root_objectid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret =
btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); if (ret) return ret; @@ -245,125 +243,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } -/* - * check if the tree block can be shared by multiple trees - */ -int btrfs_block_can_be_shared(struct btrfs_root *root, - struct extent_buffer *buf) -{ - /* - * Tree blocks not in refernece counted trees and tree roots - * are never shared. If a block was allocated after the last - * snapshot and the block was not allocated by tree relocation, - * we know the block is not shared. - */ - if (root->ref_cows && - buf != root->node && buf != root->commit_root && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - return 1; -#endif - return 0; -} - -static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *cow) -{ - u64 refs; - u64 owner; - u64 flags; - u64 new_flags = 0; - int ret; - - /* - * Backrefs update rules: - * - * Always use full backrefs for extent pointers in tree block - * allocated by tree relocation. - * - * If a shared tree block is no longer referenced by its owner - * tree (btrfs_header_owner(buf) == root->root_key.objectid), - * use full backrefs for extent pointers in tree block. - * - * If a tree block is been relocating - * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), - * use full backrefs for extent pointers in tree block. - * The reason for this is some operations (such as drop tree) - * are only allowed for blocks use full backrefs. - */ - - if (btrfs_block_can_be_shared(root, buf)) { - ret = btrfs_lookup_extent_info(trans, root, buf->start, - buf->len, &refs, &flags); - BUG_ON(ret); - BUG_ON(refs == 0); - } else { - refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; - else - flags = 0; - } - - owner = btrfs_header_owner(buf); - BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - - if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); - } - new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else { - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); - } - if (new_flags != 0) { - ret = btrfs_set_disk_extent_flags(trans, root, - buf->start, - buf->len, - new_flags, 0); - BUG_ON(ret); - } - } else { - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); - } - clean_tree_block(trans, root, buf); - } - return 0; -} - /* * does the dirty work in cow of a single block. 
The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -383,39 +262,34 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size) { - struct btrfs_disk_key disk_key; + u64 parent_start; struct extent_buffer *cow; + u32 nritems; + int ret = 0; int level; int unlock_orig = 0; - u64 parent_start; if (*cow_ret == buf) unlock_orig = 1; btrfs_assert_tree_locked(buf); + if (parent) + parent_start = parent->start; + else + parent_start = 0; + WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); - if (level == 0) - btrfs_item_key(buf, &disk_key, 0); - else - btrfs_node_key(buf, &disk_key, 0); - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent) - parent_start = parent->start; - else - parent_start = 0; - } else - parent_start = 0; - - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, root->root_key.objectid, + trans->transid, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -424,53 +298,83 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | - BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); - else - btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow); + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. 
+ */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } if (buf == root->node) { WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - parent_start = buf->start; - else - parent_start = 0; spin_lock(&root->node_lock); root->node = cow; extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } free_extent_buffer(buf); add_root_to_dirty_list(root); } else { - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent_start = parent->start; - else - parent_start = 0; - - WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_set_node_blockptr(parent, parent_slot, cow->start); + WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -480,18 +384,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) -{ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 0; - return 1; -} - /* * cows a single block, see __btrfs_cow_block for the real work. 
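The reference bookkeeping restored to __btrfs_cow_block() above turns on a single question: was buf last written during the current transaction? A condensed view of the three branches, with names exactly as in the hunk and error handling plus the reloc-tree details elided:

	if (btrfs_header_generation(buf) != trans->transid) {
		/* older block: the original stays live in the previous
		 * tree, so everything it points to gains a reference
		 * on behalf of the new copy */
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/* reloc-tree block: move the last reference, or add a
		 * shared one, depending on the current ref count */
	} else {
		/* same-transaction block: references move wholesale to
		 * the copy and the original is scrubbed for reuse */
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		clean_tree_block(trans, root, buf);
	}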
* This version of it has extra checks so that a block isn't cow'd more than @@ -519,7 +411,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - if (!should_cow_block(trans, root, buf)) { + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; return 0; } @@ -575,7 +469,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) /* * same as comp_keys only with two btrfs_key's */ -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -951,12 +845,6 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot) -{ - return bin_search(eb, key, level, slot); -} - /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -1033,6 +921,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root->node = child; spin_unlock(&root->node_lock); + ret = btrfs_update_extent_ref(trans, root, child->start, + child->len, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -1043,7 +938,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); ret = btrfs_free_extent(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level, 1); + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -1052,7 +949,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) > 2) + if (trans->transaction->delayed_refs.flushing && + btrfs_header_nritems(mid) > 2) return 0; if (btrfs_header_nritems(mid) < 2) @@ -1100,6 +998,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; if (btrfs_header_nritems(right) == 0) { u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); u32 blocksize = right->len; clean_tree_block(trans, root, right); @@ -1111,9 +1010,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, - blocksize, 0, - root->root_key.objectid, - level, 0); + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); if (wret) ret = wret; } else { @@ -1148,6 +1047,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } if (btrfs_header_nritems(mid) == 0) { /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); u64 bytenr = mid->start; u32 blocksize = mid->len; @@ -1159,8 +1059,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, - level, 0); + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); if (wret) ret = wret; } else { @@ -1536,7 +1437,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if 
(path->keep_locks) + if (path->keep_locks || path->lowest_level) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1651,7 +1552,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { int sret; sret = reada_for_balance(root, p, level); @@ -1713,17 +1614,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_unlock = 2; again: - if (p->search_commit_root) { - b = root->commit_root; - extent_buffer_get(b); - if (!p->skip_locking) - btrfs_tree_lock(b); - } else { - if (p->skip_locking) - b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); - } + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); while (b) { level = btrfs_header_level(b); @@ -1744,9 +1638,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; - + } btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, @@ -1868,6 +1764,138 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); + BUG_ON(ret); + + btrfs_set_lock_blocking(eb); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owned by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them.
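The skip-COW test that btrfs_search_slot() open-codes above (and that btrfs_cow_block() open-codes earlier) is worth spelling out, since this patch also removes the should_cow_block() helper that used to give it a name. A hypothetical wrapper with the same three-part condition, written out purely for readability:

	/* a block may be modified in place only if all three hold: it was
	 * written in this transaction, it belongs to this root, and it
	 * has not been flushed to disk since it was dirtied */
	static int can_modify_in_place(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *b)
	{
		return btrfs_header_generation(b) == trans->transid &&
		       btrfs_header_owner(b) == root->root_key.objectid &&
		       !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN);
	}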
+ */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. 
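Every bulk-move helper patched below (push_node_left, balance_node_right, the leaf pushers, copy_for_split and split_node) regains a trailing btrfs_update_ref() call: under the reverted back-reference scheme each extent ref names the block that holds the pointer to it, so whenever child pointers are copied between nodes the moved entries must be re-tagged with their new parent. The call shape is uniform; from the push_node_left hunk (arguments as shown there):

	/* after moving push_items pointers out of src and into dst,
	 * starting at slot dst_nritems of dst, fix up their backrefs */
	ret = btrfs_update_ref(trans, root, src, dst,
			       dst_nritems, push_items);
	BUG_ON(ret);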
@@ -1993,6 +2021,9 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); + ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); + BUG_ON(ret); + return ret; } @@ -2052,6 +2083,9 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); + ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); + BUG_ON(ret); + return ret; } @@ -2071,6 +2105,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; + int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -2082,17 +2117,16 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, + root->root_key.objectid, trans->transid, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); - memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + memset_extent_buffer(c, 0, 0, root->nodesize); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); btrfs_set_header_generation(c, trans->transid); - btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); write_extent_buffer(c, root->fs_info->fsid, @@ -2117,6 +2151,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root->node = c; spin_unlock(&root->node_lock); + ret = btrfs_update_extent_ref(trans, root, lower->start, + lower->len, lower->start, c->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2193,7 +2233,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else { + } else if (!trans->transaction->delayed_refs.flushing) { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2204,21 +2244,20 @@ static noinline int split_node(struct btrfs_trans_handle *trans, } c_nritems = btrfs_header_nritems(c); - mid = (c_nritems + 1) / 2; - btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + split = btrfs_alloc_free_block(trans, root, root->nodesize, + path->nodes[level + 1]->start, root->root_key.objectid, - &disk_key, level, c->start, 0); + trans->transid, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); - memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_flags(split, btrfs_header_flags(c)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); - btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); + btrfs_set_header_flags(split, 0); write_extent_buffer(split, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(split), BTRFS_FSID_SIZE); @@ -2226,6 +2265,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); + mid = (c_nritems + 1) / 2; copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), @@ -2238,12 +2278,16 @@ static noinline int 
split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); + btrfs_node_key(split, &disk_key, 0); wret = insert_ptr(trans, root, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; + ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); + BUG_ON(ret); + if (path->slots[level] >= mid) { path->slots[level] -= mid; btrfs_tree_unlock(c); @@ -2316,6 +2360,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 right_nritems; u32 data_end; u32 this_item_size; + int ret; if (empty) nr = 0; @@ -2428,6 +2473,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); btrfs_mark_buffer_dirty(right); + ret = btrfs_update_ref(trans, root, left, right, 0, push_items); + BUG_ON(ret); + btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); btrfs_mark_buffer_dirty(upper); @@ -2672,6 +2720,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); + ret = btrfs_update_ref(trans, root, right, left, + old_left_nritems, push_items); + BUG_ON(ret); + btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); if (wret) @@ -2828,6 +2880,9 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(l); BUG_ON(path->slots[0] != slot); + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + if (mid <= slot) { btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -2856,7 +2911,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, int extend) { - struct btrfs_disk_key disk_key; struct extent_buffer *l; u32 nritems; int mid; @@ -2864,11 +2918,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *right; int ret = 0; int wret; - int split; + int double_split; int num_doubles = 0; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && + !trans->transaction->delayed_refs.flushing) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -2890,53 +2945,16 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return ret; } again: - split = 1; + double_split = 0; l = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(l); mid = (nritems + 1) / 2; - if (mid <= slot) { - if (nritems == 1 || - leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (slot >= nritems) { - split = 0; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - split = 2; - } - } - } - } else { - if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (!extend && data_size && slot == 0) { - split = 0; - } else if ((extend || !data_size) && slot == 0) { - mid = 1; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - split = 2 ; - } - } - } - } - - if (split == 0) - btrfs_cpu_key_to_disk(&disk_key, ins_key); - else - btrfs_item_key(l, &disk_key, mid); - - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + right = btrfs_alloc_free_block(trans, root, root->leafsize, + 
path->nodes[1]->start, root->root_key.objectid, - &disk_key, 0, l->start, 0); + trans->transid, 0, l->start, 0); if (IS_ERR(right)) { BUG_ON(1); return PTR_ERR(right); @@ -2945,7 +2963,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, @@ -2956,47 +2973,79 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); - if (split == 0) { - if (mid <= slot) { - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + struct btrfs_disk_key disk_key; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - } else { - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, - path->slots[1], 1); - if (wret) - ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); if (wret) ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } } } - btrfs_mark_buffer_dirty(right); - return ret; } ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); BUG_ON(ret); - if (split == 2) { + if (double_split) { BUG_ON(num_doubles != 0); num_doubles++; goto again; @@ -3398,7 +3447,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, /* figure out how many keys we can insert in here */ total_data = data_size[0]; for (i = 1; i < nr; i++) { - if 
(btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) + if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) break; total_data += data_size[i]; } @@ -3696,7 +3745,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * a helper function to delete the leaf pointed to by path->slots[1] and - * path->nodes[1]. + * path->nodes[1]. bytenr is the node block pointer, but since the callers + * already know it, it is faster to have them pass it down than to + * read it out of the node again. * * This deletes the pointer in path->nodes[1] and frees the leaf * block extent. zero is returned if it all worked out, < 0 otherwise. @@ -3704,14 +3755,15 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr) { int ret; + u64 root_gen = btrfs_header_generation(path->nodes[1]); + u64 parent_start = path->nodes[1]->start; + u64 parent_owner = btrfs_header_owner(path->nodes[1]); - WARN_ON(btrfs_header_generation(leaf) != trans->transid); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; @@ -3722,8 +3774,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0, 0); + ret = btrfs_free_extent(trans, root, bytenr, + btrfs_level_size(root, 0), + parent_start, parent_owner, + root_gen, 0, 1); return ret; } /* @@ -3791,7 +3845,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - ret = btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf->start); BUG_ON(ret); } } else { @@ -3807,7 +3861,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && + !trans->transaction->delayed_refs.flushing) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3829,7 +3884,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, + leaf->start); BUG_ON(ret); free_extent_buffer(leaf); } else { diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index 03441a99ea38..4414a5d9983a 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -45,8 +45,6 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 -#define BTRFS_COMPAT_EXTENT_TREE_V0 - /* * files bigger than this get some pre-flushing when they are added * to the ordered operations list. 
 * That way we limit the total
@@ -269,18 +267,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 }

 #define BTRFS_FSID_SIZE 16
-#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
-#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
-#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
-#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
-
-#define BTRFS_BACKREF_REV_MAX		256
-#define BTRFS_BACKREF_REV_SHIFT		56
-#define BTRFS_BACKREF_REV_MASK	(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
-				 BTRFS_BACKREF_REV_SHIFT)
-
-#define BTRFS_OLD_BACKREF_REV		0
-#define BTRFS_MIXED_BACKREF_REV		1
+#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)

 /*
  * every tree block (leaf or node) starts with this header.
@@ -309,6 +296,7 @@ struct btrfs_header {
 				sizeof(struct btrfs_item) - \
 				sizeof(struct btrfs_file_extent_item))

+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)

 /*
  * this is a very generous portion of the super block, giving us
@@ -367,12 +355,9 @@ struct btrfs_super_block {
  * Compat flags that we support. If any incompat flags are set other than the
  * ones specified below then we will fail to mount
  */
-#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
-
-#define BTRFS_FEATURE_COMPAT_SUPP	0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP	0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP	\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+#define BTRFS_FEATURE_COMPAT_SUPP	0x0
+#define BTRFS_FEATURE_COMPAT_RO_SUPP	0x0
+#define BTRFS_FEATURE_INCOMPAT_SUPP	0x0

 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -436,65 +421,23 @@ struct btrfs_path {
 	unsigned int keep_locks:1;
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
-	unsigned int search_commit_root:1;
 };

 /*
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
  */
-
 struct btrfs_extent_item {
-	__le64 refs;
-	__le64 generation;
-	__le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_item_v0 {
 	__le32 refs;
 } __attribute__ ((__packed__));

-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
-					sizeof(struct btrfs_item))
-
-#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
-#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)
-
-/* following flags only apply to tree blocks */
-
-/* use full backrefs for extent pointers in the block */
-#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)
-
-struct btrfs_tree_block_info {
-	struct btrfs_disk_key key;
-	u8 level;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_data_ref {
-	__le64 root;
-	__le64 objectid;
-	__le64 offset;
-	__le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_shared_data_ref {
-	__le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_inline_ref {
-	u8 type;
-	u64 offset;
-} __attribute__ ((__packed__));
-
-/* old style backrefs item */
-struct btrfs_extent_ref_v0 {
+struct btrfs_extent_ref {
 	__le64 root;
 	__le64 generation;
 	__le64 objectid;
-	__le32 count;
+	__le32 num_refs;
 } __attribute__ ((__packed__));

-
 /* dev extents record free space on individual devices.  The owner
  * field points back to the chunk allocation mapping tree that allocated
  * the extent.  The chunk tree uuid field is a way to double check the owner
@@ -752,7 +695,12 @@ struct btrfs_block_group_cache {
 	struct list_head cluster_list;
 };

-struct reloc_control;
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct list_head list;
+	spinlock_t lock;
+};
+
 struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
@@ -883,11 +831,18 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;

+	/* tree relocation relocated fields */
+	struct list_head dead_reloc_roots;
+	struct btrfs_leaf_ref_tree reloc_ref_tree;
+	struct btrfs_leaf_ref_tree shared_ref_tree;
+
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	atomic_t throttles;
+	atomic_t throttle_gen;

 	u64 total_pinned;

@@ -906,8 +861,6 @@ struct btrfs_fs_info {
 	 */
 	struct list_head space_info;

-	struct reloc_control *reloc_ctl;
-
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
@@ -938,6 +891,7 @@ struct btrfs_fs_info {
 * in ram representation of the tree.  extent_root is used for all allocations
 * and for the extent tree extent_root root.
 */
+struct btrfs_dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;

@@ -945,6 +899,9 @@ struct btrfs_root {
 	spinlock_t node_lock;

 	struct extent_buffer *commit_root;
+	struct btrfs_leaf_ref_tree *ref_tree;
+	struct btrfs_leaf_ref_tree ref_tree_struct;
+	struct btrfs_dirty_root *dirty_root;
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;

@@ -995,15 +952,10 @@ struct btrfs_root {
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;

-	struct list_head root_list;
-
 	spinlock_t list_lock;
+	struct list_head dead_list;
 	struct list_head orphan_list;

-	spinlock_t inode_lock;
-	/* red-black tree that keeps track of in-memory inodes */
-	struct rb_root inode_tree;
-
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
@@ -1065,16 +1017,7 @@ struct btrfs_root {
 * are used, and how many references there are to each block
 */
 #define BTRFS_EXTENT_ITEM_KEY	168
-
-#define BTRFS_TREE_BLOCK_REF_KEY	176
-
-#define BTRFS_EXTENT_DATA_REF_KEY	178
-
-#define BTRFS_EXTENT_REF_V0_KEY	180
-
-#define BTRFS_SHARED_BLOCK_REF_KEY	182
-
-#define BTRFS_SHARED_DATA_REF_KEY	184
+#define BTRFS_EXTENT_REF_KEY	180

 /*
 * block groups give us hints into the extent allocation trees.  Which
@@ -1100,8 +1043,6 @@ struct btrfs_root {
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
 #define BTRFS_MOUNT_NOTREELOG		(1 << 6)
 #define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
-#define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
-#define BTRFS_MOUNT_NOSSD		(1 << 9)

 #define btrfs_clear_opt(o, opt)	((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)	((o) |= BTRFS_MOUNT_##opt)
@@ -1115,14 +1056,12 @@ struct btrfs_root {
 #define BTRFS_INODE_READONLY		(1 << 2)
 #define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define BTRFS_INODE_PREALLOC		(1 << 4)
-#define BTRFS_INODE_SYNC		(1 << 5)
-#define BTRFS_INODE_IMMUTABLE		(1 << 6)
-#define BTRFS_INODE_APPEND		(1 << 7)
-#define BTRFS_INODE_NODUMP		(1 << 8)
-#define BTRFS_INODE_NOATIME		(1 << 9)
-#define BTRFS_INODE_DIRSYNC		(1 << 10)
-
-
+#define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
+					 ~BTRFS_INODE_##flag)
+#define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
+					 BTRFS_INODE_##flag)
+#define btrfs_test_flag(inode, flag)	(BTRFS_I(inode)->flags & \
+					 BTRFS_INODE_##flag)

 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1378,67 +1317,24 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 	return (u8 *)((unsigned long)dev + ptr);
 }

-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
-BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
-		   generation, 64);
-BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
-
-BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
-
-
-BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
-
-static inline void btrfs_tree_block_key(struct extent_buffer *eb,
-					struct btrfs_tree_block_info *item,
-					struct btrfs_disk_key *key)
-{
-	read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
-					    struct btrfs_tree_block_info *item,
-					    struct btrfs_disk_key *key)
-{
-	write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
-		   root, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
-		   objectid, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
-		   offset, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
-		   count, 32);
-
-BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
-		   count, 32);
-
-BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
-		   type, 8);
-BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
-		   offset, 64);
+/* struct btrfs_extent_ref */
+BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);

-static inline u32 btrfs_extent_inline_ref_size(int type)
-{
-	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    type == BTRFS_SHARED_BLOCK_REF_KEY)
-		return sizeof(struct btrfs_extent_inline_ref);
-	if (type == BTRFS_SHARED_DATA_REF_KEY)
-		return sizeof(struct btrfs_shared_data_ref) +
-		       sizeof(struct btrfs_extent_inline_ref);
-	if (type == BTRFS_EXTENT_DATA_REF_KEY)
-		return sizeof(struct btrfs_extent_data_ref) +
-		       offsetof(struct btrfs_extent_inline_ref, offset);
-	BUG();
-	return 0;
-}
+BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
+			 objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
+			 num_refs, 32);

-BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
-BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
-		   generation, 64);
-BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
+/* struct btrfs_extent_item */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
+			 refs, 32);

 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
@@ -1662,21 +1558,6 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
 	return (flags & flag) == flag;
 }

-static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
-{
-	u64 flags = btrfs_header_flags(eb);
-	return flags >> BTRFS_BACKREF_REV_SHIFT;
-}
-
-static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
-						int rev)
-{
-	u64 flags = btrfs_header_flags(eb);
-	flags &= ~BTRFS_BACKREF_REV_MASK;
-	flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
-	btrfs_set_header_flags(eb, flags);
-}
-
 static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1909,32 +1790,39 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 objectid, u64 offset, u64 bytenr);
+			  struct btrfs_root *root, u64 objectid, u64 bytenr);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root, u32 blocksize,
-					u64 parent, u64 root_objectid,
-					struct btrfs_disk_key *key, int level,
-					u64 hint, u64 empty_size);
+					     struct btrfs_root *root,
+					     u32 blocksize, u64 parent,
+					     u64 root_objectid,
+					     u64 ref_generation,
+					     int level,
+					     u64 hint,
+					     u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
 					    int level);
-int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root,
-				     u64 root_objectid, u64 owner,
-				     u64 offset, struct btrfs_key *ins);
-int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   u64 root_objectid, u64 owner, u64 offset,
-				   struct btrfs_key *ins);
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 parent, u64 min_bytes,
+		       u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 empty_size, u64 hint_byte,
+		       u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, u64 parent,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, struct btrfs_key *ins);
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 parent,
+			      u64 root_objectid, u64 ref_generation,
+			      u64 owner, struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
@@ -1942,18 +1830,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 			 u64 search_end, struct btrfs_key *ins,
 			 u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
-int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
-int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data);
+		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+		  u32 *nr_extents);
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct extent_buffer *buf, u32 nr_extents);
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct extent_buffer *orig_buf,
+		     struct extent_buffer *buf, int start_slot, int nr);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 owner, u64 offset);
-
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, int pin);
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1961,8 +1849,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset);
-
+			 u64 root_objectid, u64 ref_generation,
+			 u64 owner_objectid);
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+			    u64 orig_parent, u64 parent,
+			    u64 root_objectid, u64 ref_generation,
+			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1974,9 +1867,16 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-				struct btrfs_block_group_cache *group);
-
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1991,12 +1891,13 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			       u64 bytes);
 /* ctree.c */
-int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
-		     int level, int *slot);
-int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_key *node_keys,
+		     u64 *nodes, int lowest_level);
 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *new_key);
@@ -2017,8 +1918,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root,
 		    struct extent_buffer *buf,
 		    struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_root *root,
-			      struct extent_buffer *buf);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -2045,6 +1944,9 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		    struct btrfs_path *path, int slot, int nr);
+int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root,
+		   struct btrfs_path *path, u64 bytenr);
 static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path)
@@ -2103,9 +2005,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
-int btrfs_set_root_node(struct btrfs_root_item *item,
-			struct extent_buffer *node);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+			  struct btrfs_root *latest_root);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
@@ -2238,6 +2139,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
+void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2245,8 +2147,12 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    struct btrfs_root *root, int wait);
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				struct btrfs_root *root);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *is_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2262,8 +2168,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);

 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-void btrfs_update_iflags(struct inode *inode);
-void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);

 /* file.c */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
@@ -2301,20 +2205,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);

 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
-#else
-#define btrfs_check_acl NULL
-#endif
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);

-/* relocation.c */
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root);
-int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 #endif
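The ctree.h hunks above collapse the three-field extent item back to a single
little-endian refcount and restore the original btrfs_extent_ref item. As a
quick sanity check on the restored on-disk layout, here is a minimal
standalone sketch; the typedefs are stand-ins for the kernel's bitwise
__le types, so read it as an illustration rather than as kernel code:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t __le32;	/* stand-ins for the kernel bitwise types */
typedef uint64_t __le64;

struct btrfs_extent_ref {
	__le64 root;		/* objectid of the referencing tree root */
	__le64 generation;	/* transid when the ref was created */
	__le64 objectid;	/* inode objectid, or level for tree blocks */
	__le32 num_refs;	/* references held through this parent */
} __attribute__ ((__packed__));

int main(void)
{
	/* packed => no tail padding: 8 + 8 + 8 + 4 bytes */
	assert(sizeof(struct btrfs_extent_ref) == 28);
	printf("extent ref item is %zu bytes on disk\n",
	       sizeof(struct btrfs_extent_ref));
	return 0;
}

The packed attribute matters here: without it the trailing __le32 would be
padded out to an 8-byte boundary and the item would no longer match what
older kernels wrote to disk.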
diff --git a/trunk/fs/btrfs/delayed-ref.c b/trunk/fs/btrfs/delayed-ref.c
index 84e6781413b1..d6c01c096a40 100644
--- a/trunk/fs/btrfs/delayed-ref.c
+++ b/trunk/fs/btrfs/delayed-ref.c
@@ -29,87 +29,27 @@
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
 */

 /*
- * compare two delayed tree backrefs with same bytenr and type
- */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
-			  struct btrfs_delayed_tree_ref *ref1)
-{
-	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
-	return 0;
-}
-
-/*
- * compare two delayed data backrefs with same bytenr and type
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
 */
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
-			  struct btrfs_delayed_data_ref *ref1)
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+		      u64 bytenr, u64 parent)
 {
-	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-		if (ref1->objectid < ref2->objectid)
-			return -1;
-		if (ref1->objectid > ref2->objectid)
-			return 1;
-		if (ref1->offset < ref2->offset)
-			return -1;
-		if (ref1->offset > ref2->offset)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
-	return 0;
-}
-
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1)
-{
-	if (ref1->bytenr < ref2->bytenr)
+	if (bytenr < ref->bytenr)
 		return -1;
-	if (ref1->bytenr > ref2->bytenr)
+	if (bytenr > ref->bytenr)
 		return 1;
-	if (ref1->is_head && ref2->is_head)
-		return 0;
-	if (ref2->is_head)
+	if (parent < ref->parent)
 		return -1;
-	if (ref1->is_head)
+	if (parent > ref->parent)
 		return 1;
-	if (ref1->type < ref2->type)
-		return -1;
-	if (ref1->type > ref2->type)
-		return 1;
-	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
-		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
-				      btrfs_delayed_node_to_tree_ref(ref1));
-	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
-		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
-		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
-				      btrfs_delayed_node_to_data_ref(ref1));
-	}
-	BUG();
	return 0;
 }
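After this hunk comp_entry() orders the rb tree by (bytenr, parent) alone.
A small userspace sketch of the same two-level comparison follows; the
struct and names are invented for the example, and the head entry is
probed with parent == (u64)-1 as later hunks in this file do:

#include <stdint.h>
#include <stdio.h>

/* toy stand-in for the delayed ref node; only the sort keys matter */
struct node {
	uint64_t bytenr;
	uint64_t parent;
};

/* same ordering rule as comp_entry: bytenr first, then parent */
static int comp(const struct node *ref, uint64_t bytenr, uint64_t parent)
{
	if (bytenr < ref->bytenr)
		return -1;
	if (bytenr > ref->bytenr)
		return 1;
	if (parent < ref->parent)
		return -1;
	if (parent > ref->parent)
		return 1;
	return 0;
}

int main(void)
{
	struct node head = { .bytenr = 4096, .parent = UINT64_MAX };
	/* a real backref (small parent) sorts before the head entry,
	 * which carries the largest possible parent for its bytenr */
	printf("%d\n", comp(&head, 4096, 0));		/* prints -1 */
	printf("%d\n", comp(&head, 4096, UINT64_MAX));	/* prints 0 */
	return 0;
}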
@@ -119,21 +59,20 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 * inserted.
 */
 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+						  u64 bytenr, u64 parent,
 						  struct rb_node *node)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_node *entry;
-	struct btrfs_delayed_ref_node *ins;
 	int cmp;

-	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	while (*p) {
 		parent_node = *p;
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);

-		cmp = comp_entry(entry, ins);
+		cmp = comp_entry(entry, bytenr, parent);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -142,17 +81,18 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 			return entry;
 	}

+	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	rb_link_node(node, parent_node, p);
 	rb_insert_color(node, root);
 	return NULL;
 }

 /*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * find an entry based on (bytenr,parent).  This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
 */
-static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
-				  u64 bytenr,
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+				  u64 bytenr, u64 parent,
 				  struct btrfs_delayed_ref_node **last)
 {
 	struct rb_node *n = root->rb_node;
@@ -165,15 +105,7 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
 		if (last)
 			*last = entry;

-		if (bytenr < entry->bytenr)
-			cmp = -1;
-		else if (bytenr > entry->bytenr)
-			cmp = 1;
-		else if (!btrfs_delayed_ref_is_head(entry))
-			cmp = 1;
-		else
-			cmp = 0;
-
+		cmp = comp_entry(entry, bytenr, parent);
 		if (cmp < 0)
 			n = n->rb_left;
 		else if (cmp > 0)
@@ -222,7 +154,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 		node = rb_first(&delayed_refs->root);
 	} else {
 		ref = NULL;
-		find_ref_head(&delayed_refs->root, start, &ref);
+		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
 		if (ref) {
 			struct btrfs_delayed_ref_node *tmp;

@@ -302,7 +234,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);

-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref) {
 		prev_node = rb_prev(&ref->rb_node);
 		if (!prev_node)
@@ -318,28 +250,25 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 }

 /*
- * helper function to lookup reference count and flags of extent.
+ * helper function to lookup reference count
 *
 * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
+ * reference count modifications queued up in the rbtree. This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
 */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs)
 {
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
 	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
-	u32 item_size;
-	u64 num_refs;
-	u64 extent_flags;
+	u32 num_refs;
 	int ret;

 	path = btrfs_alloc_path();
@@ -358,60 +287,37 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,

 	if (ret == 0) {
 		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if (item_size >= sizeof(*ei)) {
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_extent_item);
-			num_refs = btrfs_extent_refs(leaf, ei);
-			extent_flags = btrfs_extent_flags(leaf, ei);
-		} else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-			struct btrfs_extent_item_v0 *ei0;
-			BUG_ON(item_size != sizeof(*ei0));
-			ei0 = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_extent_item_v0);
-			num_refs = btrfs_extent_refs_v0(leaf, ei0);
-			/* FIXME: this isn't correct for data */
-			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-			BUG();
-#endif
-		}
-		BUG_ON(num_refs == 0);
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_extent_item);
+		num_refs = btrfs_extent_refs(leaf, ei);
 	} else {
 		num_refs = 0;
-		extent_flags = 0;
 		ret = 0;
 	}

 	spin_lock(&delayed_refs->lock);
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref) {
 		head = btrfs_delayed_node_to_head(ref);
-		if (!mutex_trylock(&head->mutex)) {
-			atomic_inc(&ref->refs);
-			spin_unlock(&delayed_refs->lock);
-
-			btrfs_release_path(root->fs_info->extent_root, path);
-
-			mutex_lock(&head->mutex);
+		if (mutex_trylock(&head->mutex)) {
+			num_refs += ref->ref_mod;
 			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(ref);
-			goto again;
+			*refs = num_refs;
+			goto out;
 		}
-		if (head->extent_op && head->extent_op->update_flags)
-			extent_flags |= head->extent_op->flags_to_set;
-		else
-			BUG_ON(num_refs == 0);

-		num_refs += ref->ref_mod;
+		atomic_inc(&ref->refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(root->fs_info->extent_root, path);
+
+		mutex_lock(&head->mutex);
 		mutex_unlock(&head->mutex);
-	}
-	WARN_ON(num_refs == 0);
-	if (refs)
+		btrfs_put_delayed_ref(ref);
+		goto again;
+	} else {
 		*refs = num_refs;
-	if (flags)
-		*flags = extent_flags;
+	}
 out:
 	spin_unlock(&delayed_refs->lock);
 	btrfs_free_path(path);
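The restored btrfs_lookup_extent_ref() reports the on-disk count plus
whatever modifications are still queued on the head node. A toy sketch of
that arithmetic, with a plain array standing in for the queued ref_mod
values (everything here is invented for illustration):

#include <stdint.h>
#include <stdio.h>

/* effective count = on-disk count + every pending modification */
static uint32_t effective_refs(uint32_t on_disk, const int *ref_mods, int n)
{
	uint32_t refs = on_disk;
	for (int i = 0; i < n; i++)
		refs += ref_mods[i];	/* ref_mod may be negative (drops) */
	return refs;
}

int main(void)
{
	int mods[] = { +1, +1, -1 };	/* two adds and a drop still queued */
	printf("%u\n", effective_refs(2, mods, 3));	/* prints 3 */
	return 0;
}

In the kernel code the head node already caches that sum as a single
ref_mod, which is why the lookup only has to add one value under the mutex.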
@@ -432,7 +338,16 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_delayed_ref_node *existing,
 			 struct btrfs_delayed_ref_node *update)
 {
-	if (update->action != existing->action) {
+	struct btrfs_delayed_ref *existing_ref;
+	struct btrfs_delayed_ref *ref;
+
+	existing_ref = btrfs_delayed_node_to_ref(existing);
+	ref = btrfs_delayed_node_to_ref(update);
+
+	if (ref->pin)
+		existing_ref->pin = 1;
+
+	if (ref->action != existing_ref->action) {
 		/*
 		 * this is effectively undoing either an add or a
 		 * drop.  We decrement the ref_mod, and if it goes
@@ -448,13 +363,20 @@ update_existing_ref,
 			delayed_refs->num_entries--;
 			if (trans->delayed_ref_updates)
 				trans->delayed_ref_updates--;
-		} else {
-			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
 		}
 	} else {
-		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+			/* if we're adding refs, make sure all the
+			 * details match up.  The extent could
+			 * have been totally freed and reallocated
+			 * by a different owner before the delayed
+			 * ref entries were removed.
+			 */
+			existing_ref->owner_objectid = ref->owner_objectid;
+			existing_ref->generation = ref->generation;
+			existing_ref->root = ref->root;
+			existing->num_bytes = update->num_bytes;
+		}
 		/*
 		 * the action on the existing ref matches
 		 * the action on the ref we're trying to add.
@@ -479,7 +401,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);

-	BUG_ON(existing_ref->is_data != ref->is_data);
 	if (ref->must_insert_reserved) {
 		/* if the extent was freed and then
@@ -499,24 +420,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	}

-	if (ref->extent_op) {
-		if (!existing_ref->extent_op) {
-			existing_ref->extent_op = ref->extent_op;
-		} else {
-			if (ref->extent_op->update_key) {
-				memcpy(&existing_ref->extent_op->key,
-				       &ref->extent_op->key,
-				       sizeof(ref->extent_op->key));
-				existing_ref->extent_op->update_key = 1;
-			}
-			if (ref->extent_op->update_flags) {
-				existing_ref->extent_op->flags_to_set |=
-					ref->extent_op->flags_to_set;
-				existing_ref->extent_op->update_flags = 1;
-			}
-			kfree(ref->extent_op);
-		}
-	}
 	/*
 	 * update the reference mod on the head to reflect this new operation
 	 */
@@ -524,16 +427,19 @@
 }

 /*
- * helper function to actually insert a head node into the rbtree.
+ * helper function to actually insert a delayed ref into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
 */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
-					 struct btrfs_delayed_ref_node *ref,
-					 u64 bytenr, u64 num_bytes,
-					 int action, int is_data)
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_delayed_ref_node *ref,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
 {
 	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_ref *full_ref;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
@@ -543,10 +449,12 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
 	 */
-	if (action == BTRFS_UPDATE_DELAYED_HEAD)
-		count_mod = 0;
-	else if (action == BTRFS_DROP_DELAYED_REF)
-		count_mod = -1;
+	if (parent == (u64)-1) {
+		if (action == BTRFS_DROP_DELAYED_REF)
+			count_mod = -1;
+		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+			count_mod = 0;
+	}

 	/*
 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
@@ -559,148 +467,57 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	 * Once we record must_insert_reserved, switch the action to
 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
 	 */
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
+	if (action == BTRFS_ADD_DELAYED_EXTENT) {
 		must_insert_reserved = 1;
-	else
-		must_insert_reserved = 0;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	/* first set the basic ref node struct up */
-	atomic_set(&ref->refs, 1);
-	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = count_mod;
-	ref->type = 0;
-	ref->action = 0;
-	ref->is_head = 1;
-	ref->in_tree = 1;
-
-	head_ref = btrfs_delayed_node_to_head(ref);
-	head_ref->must_insert_reserved = must_insert_reserved;
-	head_ref->is_data = is_data;
-
-	INIT_LIST_HEAD(&head_ref->cluster);
-	mutex_init(&head_ref->mutex);
-
-	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
-	if (existing) {
-		update_existing_head_ref(existing, ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
-		kfree(ref);
-	} else {
-		delayed_refs->num_heads++;
-		delayed_refs->num_heads_ready++;
-		delayed_refs->num_entries++;
-		trans->delayed_ref_updates++;
-	}
-	return 0;
-}
-
-/*
- * helper to insert a delayed tree ref into the rbtree.
- */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_delayed_ref_node *ref,
-					 u64 bytenr, u64 num_bytes, u64 parent,
-					 u64 ref_root, int level, int action)
-{
-	struct btrfs_delayed_ref_node *existing;
-	struct btrfs_delayed_tree_ref *full_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	/* first set the basic ref node struct up */
-	atomic_set(&ref->refs, 1);
-	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = 1;
-	ref->action = action;
-	ref->is_head = 0;
-	ref->in_tree = 1;
-
-	full_ref = btrfs_delayed_node_to_tree_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
-		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
	} else {
-		full_ref->root = ref_root;
-		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-	}
-	full_ref->level = level;
-
-	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
-	if (existing) {
-		update_existing_ref(trans, delayed_refs, existing, ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
-		kfree(ref);
-	} else {
-		delayed_refs->num_entries++;
-		trans->delayed_ref_updates++;
+		must_insert_reserved = 0;
 	}
-	return 0;
-}
-
-/*
- * helper to insert a delayed data ref into the rbtree.
- */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_delayed_ref_node *ref,
-					 u64 bytenr, u64 num_bytes, u64 parent,
-					 u64 ref_root, u64 owner, u64 offset,
-					 int action)
-{
-	struct btrfs_delayed_ref_node *existing;
-	struct btrfs_delayed_data_ref *full_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
-		action = BTRFS_ADD_DELAYED_REF;

 	delayed_refs = &trans->transaction->delayed_refs;

 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = 1;
-	ref->action = action;
-	ref->is_head = 0;
+	ref->parent = parent;
+	ref->ref_mod = count_mod;
 	ref->in_tree = 1;
+	ref->num_bytes = num_bytes;

-	full_ref = btrfs_delayed_node_to_data_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
-		ref->type = BTRFS_SHARED_DATA_REF_KEY;
+	if (btrfs_delayed_ref_is_head(ref)) {
+		head_ref = btrfs_delayed_node_to_head(ref);
+		head_ref->must_insert_reserved = must_insert_reserved;
+		INIT_LIST_HEAD(&head_ref->cluster);
+		mutex_init(&head_ref->mutex);
 	} else {
+		full_ref = btrfs_delayed_node_to_ref(ref);
 		full_ref->root = ref_root;
-		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+		full_ref->generation = ref_generation;
+		full_ref->owner_objectid = owner_objectid;
+		full_ref->pin = pin;
+		full_ref->action = action;
 	}
-	full_ref->objectid = owner;
-	full_ref->offset = offset;

-	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+	existing = tree_insert(&delayed_refs->root, bytenr,
+			       parent, &ref->rb_node);

 	if (existing) {
-		update_existing_ref(trans, delayed_refs, existing, ref);
+		if (btrfs_delayed_ref_is_head(ref))
+			update_existing_head_ref(existing, ref);
+		else
+			update_existing_ref(trans, delayed_refs, existing, ref);
+
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
 		kfree(ref);
 	} else {
+		if (btrfs_delayed_ref_is_head(ref)) {
+			delayed_refs->num_heads++;
+			delayed_refs->num_heads_ready++;
+		}
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
@@ -708,78 +525,37 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
 }

 /*
- * add a delayed tree ref.  This does all of the accounting required
+ * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
 {
-	struct btrfs_delayed_tree_ref *ref;
+	struct btrfs_delayed_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int ret;

-	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmalloc(sizeof(*ref), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;

-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
-	if (!head_ref) {
-		kfree(ref);
-		return -ENOMEM;
-	}
-
-	head_ref->extent_op = extent_op;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-	spin_lock(&delayed_refs->lock);
-
 	/*
-	 * insert both the head node and the new ref without dropping
-	 * the spin lock
+	 * the parent = 0 case comes from cases where we don't actually
+	 * know the parent yet.  It will get updated later via a add/drop
+	 * pair.
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 0);
-	BUG_ON(ret);
-
-	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, level, action);
-	BUG_ON(ret);
-	spin_unlock(&delayed_refs->lock);
-	return 0;
-}
-
-/*
- * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
- */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes,
-			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
-{
-	struct btrfs_delayed_data_ref *ref;
-	struct btrfs_delayed_ref_head *head_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
-
-	BUG_ON(extent_op && !extent_op->is_data);
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
+	if (parent == 0)
+		parent = bytenr;

 	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
 	if (!head_ref) {
 		kfree(ref);
 		return -ENOMEM;
 	}
-
-	head_ref->extent_op = extent_op;
-
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);

@@ -787,39 +563,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 1);
-	BUG_ON(ret);
-
-	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, owner, offset, action);
+	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+				      (u64)-1, 0, 0, 0, action, pin);
 	BUG_ON(ret);
-	spin_unlock(&delayed_refs->lock);
-	return 0;
-}
-
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes,
-				struct btrfs_delayed_extent_op *extent_op)
-{
-	struct btrfs_delayed_ref_head *head_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
-
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
-	if (!head_ref)
-		return -ENOMEM;
-
-	head_ref->extent_op = extent_op;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-	spin_lock(&delayed_refs->lock);
-
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
-				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
-				   extent_op->is_data);
+	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+				      parent, ref_root, ref_generation,
+				      owner_objectid, action, pin);
 	BUG_ON(ret);
-
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -836,7 +587,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;

 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
@@ -852,7 +603,6 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
-#if 0
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 orig_parent,
			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -916,4 +666,3 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
-#endif
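With the #if 0 gone, delayed-ref.c again keeps head nodes and per-parent
backrefs in one rb tree, telling them apart only by the parent sentinel.
A compact illustration of that convention follows; the struct and values
are invented for the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ref_node {
	uint64_t bytenr;
	uint64_t parent;	/* (u64)-1 marks the head entry */
	int ref_mod;
};

static bool is_head(const struct ref_node *n)
{
	return n->parent == UINT64_MAX;
}

int main(void)
{
	/* one extent with two real refs plus its head, in tree order:
	 * real refs first, head last for a given bytenr */
	struct ref_node nodes[] = {
		{ 8192, 0,          +1 },
		{ 8192, 4096,       +1 },
		{ 8192, UINT64_MAX, +2 },	/* head: sum of the mods */
	};
	for (int i = 0; i < 3; i++)
		printf("parent=%llu head=%d\n",
		       (unsigned long long)nodes[i].parent,
		       is_head(&nodes[i]));
	return 0;
}

This is also why btrfs_add_delayed_ref() inserts two nodes under one
spin_lock: the head (parent == (u64)-1) and the real ref land adjacent in
the same tree, so processing can lock the head and walk forward.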
diff --git a/trunk/fs/btrfs/delayed-ref.h b/trunk/fs/btrfs/delayed-ref.h
index f6fc67ddad36..3bec2ff0b15c 100644
--- a/trunk/fs/btrfs/delayed-ref.h
+++ b/trunk/fs/btrfs/delayed-ref.h
@@ -30,6 +30,9 @@ struct btrfs_delayed_ref_node {
 	/* the starting bytenr of the extent */
 	u64 bytenr;

+	/* the parent our backref will point to */
+	u64 parent;
+
 	/* the size of the extent */
 	u64 num_bytes;

@@ -47,21 +50,10 @@ struct btrfs_delayed_ref_node {
 	 */
 	int ref_mod;

-	unsigned int action:8;
-	unsigned int type:8;
 	/* is this node still in the rbtree? */
-	unsigned int is_head:1;
 	unsigned int in_tree:1;
 };

-struct btrfs_delayed_extent_op {
-	struct btrfs_disk_key key;
-	u64 flags_to_set;
-	unsigned int update_key:1;
-	unsigned int update_flags:1;
-	unsigned int is_data:1;
-};
-
 /*
 * the head refs are used to hold a lock on a given extent, which allows us
 * to make sure that only one process is running the delayed refs
@@ -79,7 +71,6 @@ struct btrfs_delayed_ref_head {

 	struct list_head cluster;

-	struct btrfs_delayed_extent_op *extent_op;
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -93,26 +84,27 @@ struct btrfs_delayed_ref_head {
 	 * the free has happened.
 	 */
 	unsigned int must_insert_reserved:1;
-	unsigned int is_data:1;
 };

-struct btrfs_delayed_tree_ref {
+struct btrfs_delayed_ref {
 	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
-	int level;
-};

-struct btrfs_delayed_data_ref {
-	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
-	u64 objectid;
-	u64 offset;
+	/* the root objectid our ref will point to */
+	u64 root;
+
+	/* the generation for the backref */
+	u64 generation;
+
+	/* owner_objectid of the backref */
+	u64 owner_objectid;
+
+	/* operation done by this entry in the rbtree */
+	u8 action;
+
+	/* if pin == 1, when the extent is freed it will be pinned until
+	 * transaction commit
+	 */
+	unsigned int pin:1;
 };

 struct btrfs_delayed_ref_root {
@@ -151,25 +143,17 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 	}
 }

-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes,
-			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes,
-				struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin);
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 orig_parent,
			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -185,24 +169,18 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 */
 static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
 {
-	return node->is_head;
+	return node->parent == (u64)-1;
 }

 /*
 * helper functions to cast a node into its container
 */
-static inline struct btrfs_delayed_tree_ref *
-btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(btrfs_delayed_ref_is_head(node));
-	return container_of(node, struct btrfs_delayed_tree_ref, node);
-}
+	return container_of(node, struct btrfs_delayed_ref, node);

-static inline struct btrfs_delayed_data_ref *
-btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
-{
-	WARN_ON(btrfs_delayed_ref_is_head(node));
-	return container_of(node, struct btrfs_delayed_data_ref, node);
 }

 static inline struct btrfs_delayed_ref_head *
@@ -210,5 +188,6 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(!btrfs_delayed_ref_is_head(node));
 	return container_of(node, struct btrfs_delayed_ref_head, node);
+
 }
 #endif
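The node-to-container casts above all rely on the usual container_of()
pattern: given a pointer to an embedded member, subtract the member's
offset to recover the enclosing struct. A self-contained demonstration
with stand-in types, in case the pointer arithmetic is unfamiliar:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { int key; };

struct delayed_ref {
	long generation;
	struct node node;	/* embedded base, as in the header above */
};

int main(void)
{
	struct delayed_ref ref = { .generation = 42 };
	struct node *n = &ref.node;
	/* recover the containing object from the embedded member */
	struct delayed_ref *back = container_of(n, struct delayed_ref, node);
	printf("%ld\n", back->generation);	/* prints 42 */
	return 0;
}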
diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c
index 0d50d49d990a..4b0ea0b80c23 100644
--- a/trunk/fs/btrfs/disk-io.c
+++ b/trunk/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
 #include
 #include
 #include
-#include
 #include "compat.h"
+#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -36,6 +37,7 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
+#include "ref-cache.h"
 #include "tree-log.h"
 #include "free-space-cache.h"

@@ -171,7 +172,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-	return crc32c(seed, data, len);
+	return btrfs_crc32c(seed, data, len);
 }

 void btrfs_csum_final(u32 crc, char *result)
@@ -883,6 +884,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
 	root->node = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -897,14 +899,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->last_inode_alloc = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
-	root->inode_tree.rb_node = NULL;

 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->dead_list);
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->list_lock);
-	spin_lock_init(&root->inode_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -918,6 +918,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);

+	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+	root->ref_tree = &root->ref_tree_struct;
+
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -956,7 +959,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -1023,19 +1025,20 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	root->ref_cows = 0;

-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, BTRFS_TREE_LOG_OBJECTID,
+				      trans->transid, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
 	}

-	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
-	btrfs_set_header_bytenr(leaf, leaf->start);
-	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
-	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
 	root->node = leaf;
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_bytenr(root->node, root->node->start);
+	btrfs_set_header_generation(root->node, trans->transid);
+	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);

 	write_extent_buffer(root->node, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(root->node),
@@ -1078,7 +1081,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);

-	btrfs_set_root_node(&log_root->root_item, log_root->node);
+	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
+	btrfs_set_root_generation(&log_root->root_item, trans->transid);

 	WARN_ON(root->log_root);
 	root->log_root = log_root;
@@ -1140,7 +1144,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 insert:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1207,7 +1210,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	}
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_find_dead_roots(fs_info->tree_root,
-					    root->root_key.objectid);
+					    root->root_key.objectid, root);
 		BUG_ON(ret);
 		btrfs_orphan_cleanup(root);
 	}
@@ -1566,6 +1569,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
+	atomic_set(&fs_info->throttles, 0);
+	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -1593,7 +1598,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;

-	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
@@ -1609,6 +1613,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;

+	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
+	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
+
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -1666,12 +1674,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_iput;
 	}

-	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
-
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
 	if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1769,7 +1771,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read the system "
 		       "array on %s\n", sb->s_id);
-		goto fail_sb_buffer;
+		goto fail_sys_array;
 	}

 	blocksize = btrfs_level_size(tree_root,
@@ -1783,8 +1785,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
-	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
-	chunk_root->commit_root = btrfs_root_node(chunk_root);

 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
 	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,8 +1810,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
-	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
-	tree_root->commit_root = btrfs_root_node(tree_root);
+

 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1821,14 +1820,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,

 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
+	dev_root->track_dirty = 1;
 	if (ret)
 		goto fail_extent_root;
-	dev_root->track_dirty = 1;

 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_dev_root;
+		goto fail_extent_root;

 	csum_root->track_dirty = 1;

@@ -1850,14 +1849,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;

-	if (!btrfs_test_opt(tree_root, SSD) &&
-	    !btrfs_test_opt(tree_root, NOSSD) &&
-	    !fs_info->fs_devices->rotating) {
-		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
-		       "mode\n");
-		btrfs_set_opt(fs_info->mount_opt, SSD);
-	}
-
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u64 bytenr = btrfs_super_log_root(disk_super);

@@ -1890,7 +1881,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}

 	if (!(sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_recover_relocation(tree_root);
+		ret = btrfs_cleanup_reloc_trees(tree_root);
 		BUG_ON(ret);
 	}

@@ -1901,7 +1892,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
-
 	return tree_root;

fail_trans_kthread:
@@ -1918,19 +1908,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,

fail_csum_root:
 	free_extent_buffer(csum_root->node);
-	free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
-	free_extent_buffer(dev_root->node);
-	free_extent_buffer(dev_root->commit_root);
fail_extent_root:
 	free_extent_buffer(extent_root->node);
-	free_extent_buffer(extent_root->commit_root);
fail_tree_root:
 	free_extent_buffer(tree_root->node);
-	free_extent_buffer(tree_root->commit_root);
fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
-	free_extent_buffer(chunk_root->commit_root);
+fail_sys_array:
+	free_extent_buffer(dev_root->node);
fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2020,17 +2005,6 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	return latest;
 }

-/*
- * this should be called twice, once with wait == 0 and
- * once with wait == 1. When wait == 0 is done, all the buffer heads
- * we write are pinned.
- *
- * They are released when wait == 1 is done.
- * max_mirrors must be the same for both runs, and it indicates how
- * many supers on this one device should be written.
- *
- * max_mirrors == 0 means to write them all.
- */
 static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb,
			    int do_barriers, int wait, int max_mirrors)
@@ -2066,16 +2040,12 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __find_get_block(device->bdev, bytenr / 4096,
 					      BTRFS_SUPER_INFO_SIZE);
 			BUG_ON(!bh);
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				errors++;
-
-			/* drop our reference */
 			brelse(bh);
-
-			/* drop the reference from the wait == 0 run */
-			brelse(bh);
-			continue;
+			wait_on_buffer(bh);
+			if (buffer_uptodate(bh)) {
+				brelse(bh);
+				continue;
+			}
 		} else {
 			btrfs_set_super_bytenr(sb, bytenr);

@@ -2086,18 +2056,12 @@ static int write_dev_supers(struct btrfs_device *device,
 					      BTRFS_CSUM_SIZE);
 			btrfs_csum_final(crc, sb->csum);

-			/*
-			 * one reference for us, and we leave it for the
-			 * caller
-			 */
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);

-			/* one reference for submit_bh */
-			get_bh(bh);
-
 			set_buffer_uptodate(bh);
+			get_bh(bh);
 			lock_buffer(bh);
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
@@ -2109,7 +2073,6 @@ static int write_dev_supers(struct btrfs_device *device,
 			       device->name);
 			set_buffer_uptodate(bh);
 			device->barriers = 0;
-			/* one reference for submit_bh */
 			get_bh(bh);
 			lock_buffer(bh);
 			ret = submit_bh(WRITE_SYNC, bh);
@@ -2118,15 +2081,22 @@ static int write_dev_supers(struct btrfs_device *device,
 			ret = submit_bh(WRITE_SYNC, bh);
 		}

-		if (ret)
+		if (!ret && wait) {
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				errors++;
+		} else if (ret) {
 			errors++;
+		}
+		if (wait)
+			brelse(bh);
 	}
 	return errors < i ? 0 : -1;
 }

 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *head;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
@@ -2141,9 +2111,6 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)

 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
-
-	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	head = &root->fs_info->fs_devices->devices;
 	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
@@ -2187,7 +2154,6 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		if (ret)
 			total_errors++;
 	}
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
@@ -2207,7 +2173,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,

 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
-	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
 	if (root->anon_super.s_dev) {
@@ -2254,12 +2219,10 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 					     ARRAY_SIZE(gang));
 		if (!ret)
 			break;
-
-		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
 			root_objectid = gang[i]->root_key.objectid;
 			ret = btrfs_find_dead_roots(fs_info->tree_root,
-						    root_objectid);
+						    root_objectid, gang[i]);
 			BUG_ON(ret);
 			btrfs_orphan_cleanup(gang[i]);
 		}
@@ -2315,16 +2278,20 @@ int close_ctree(struct btrfs_root *root)
 		       (unsigned long long)fs_info->total_ref_cache_size);
 	}

-	free_extent_buffer(fs_info->extent_root->node);
-	free_extent_buffer(fs_info->extent_root->commit_root);
-	free_extent_buffer(fs_info->tree_root->node);
-	free_extent_buffer(fs_info->tree_root->commit_root);
-	free_extent_buffer(root->fs_info->chunk_root->node);
-	free_extent_buffer(root->fs_info->chunk_root->commit_root);
-	free_extent_buffer(root->fs_info->dev_root->node);
-	free_extent_buffer(root->fs_info->dev_root->commit_root);
-	free_extent_buffer(root->fs_info->csum_root->node);
-	free_extent_buffer(root->fs_info->csum_root->commit_root);
+	if (fs_info->extent_root->node)
+		free_extent_buffer(fs_info->extent_root->node);
+
+	if (fs_info->tree_root->node)
+		free_extent_buffer(fs_info->tree_root->node);
+
+	if (root->fs_info->chunk_root->node)
+		free_extent_buffer(root->fs_info->chunk_root->node);
+
+	if (root->fs_info->dev_root->node)
+		free_extent_buffer(root->fs_info->dev_root->node);
+
+	if (root->fs_info->csum_root->node)
+		free_extent_buffer(root->fs_info->csum_root->node);

 	btrfs_free_block_groups(root->fs_info);

@@ -2406,14 +2373,17 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
+	struct extent_io_tree *tree;
 	u64 num_dirty;
+	u64 start = 0;
 	unsigned long thresh = 32 * 1024 * 1024;
+	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;

 	if (current->flags & PF_MEMALLOC)
 		return;

-	num_dirty = root->fs_info->dirty_metadata_bytes;
-
+	num_dirty = count_range_bits(tree, &start, (u64)-1,
+				     thresh, EXTENT_DIRTY);
 	if (num_dirty > thresh) {
 		balance_dirty_pages_ratelimited_nr(
				   root->fs_info->btree_inode->i_mapping, 1);
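Note how write_dev_supers() ends: it tolerates partial failure, and only
reports an error once every super block copy on the device failed. A
one-function sketch of that success criterion (names invented here):

#include <stdio.h>

/* mirrors "return errors < i ? 0 : -1" at the end of write_dev_supers() */
static int supers_written_ok(int copies, int errors)
{
	return errors < copies ? 0 : -1;
}

int main(void)
{
	printf("%d\n", supers_written_ok(3, 2));	/* 0: one copy survived */
	printf("%d\n", supers_written_ok(3, 3));	/* -1: all copies failed */
	return 0;
}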
diff --git a/trunk/fs/btrfs/export.c b/trunk/fs/btrfs/export.c
index 9596b40caa4e..85315d2c90de 100644
--- a/trunk/fs/btrfs/export.c
+++ b/trunk/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;

-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode))
 		return (void *)inode;

@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;

-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 }

 const struct export_operations btrfs_export_ops = {
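Both export.c callers pass NULL for btrfs_iget()'s restored is_new
argument, since they don't care whether the inode was freshly created.
That optional out-parameter idiom, reduced to a toy function:

#include <stdbool.h>
#include <stdio.h>

/* callers that don't need the flag simply pass NULL, as export.c does */
static int lookup(int key, bool *is_new)
{
	bool created = (key % 2) != 0;	/* pretend odd keys were absent */
	if (is_new)
		*is_new = created;
	return key;
}

int main(void)
{
	bool is_new;
	lookup(7, &is_new);	/* this caller wants to know */
	printf("new=%d\n", is_new);
	lookup(8, NULL);	/* this one doesn't */
	return 0;
}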
The full back refs is for pointers in tree blocks not - * referenced by their owner trees. The location of tree block is recorded - * in the back refs. Actually the full back refs is generic, and can be - * used in all cases the implicit back refs is used. The major shortcoming - * of the full back refs is its overhead. Every time a tree block gets - * COWed, we have to update back refs entry for all pointers in it. - * - * For a newly allocated tree block, we use implicit back refs for - * pointers in it. This means most tree related operations only involve - * implicit back refs. For a tree block created in old transaction, the - * only way to drop a reference to it is COW it. So we can detect the - * event that tree block loses its owner tree's reference and do the - * back refs conversion. - * - * When a tree block is COW'd through a tree, there are four cases: - * - * The reference count of the block is one and the tree is the block's - * owner tree. Nothing to do in this case. - * - * The reference count of the block is one and the tree is not the - * block's owner tree. In this case, full back refs is used for pointers - * in the block. Remove these full back refs, add implicit back refs for - * every pointers in the new block. - * - * The reference count of the block is greater than one and the tree is - * the block's owner tree. In this case, implicit back refs is used for - * pointers in the block. Add full back refs for every pointers in the - * block, increase lower level extents' reference counts. The original - * implicit back refs are entailed to the new block. - * - * The reference count of the block is greater than one and the tree is - * not the block's owner tree. Add implicit back refs for every pointer in - * the new block, increase lower level extents' reference count. - * - * Back Reference Key composing: - * - * The key objectid corresponds to the first byte in the extent, - * The key type is used to differentiate between types of back refs. - * There are different meanings of the key offset for different types - * of back refs. - * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * - * The extent ref structure for the implicit back refs has fields for: + * The extent ref structure has fields for: * * - Objectid of the subvolume root + * - Generation number of the tree holding the reference * - objectid of the file holding the reference - * - original offset in the file - * - how many bookend extents - * - * The key offset for the implicit back refs is hash of the first - * three fields. - * - * The extent ref structure for the full back refs has field for: + * - number of references holding by parent node (alway 1 for tree blocks) * - * - number of pointers in the tree leaf + * Btree leaf may hold multiple references to a file extent. In most cases, + * these references are from same file and the corresponding offsets inside + * the file are close together. * - * The key offset for the implicit back refs is the first byte of - * the tree leaf + * When a file extent is allocated the fields are filled in: + * (root_key.objectid, trans->transid, inode objectid, 1) * - * When a file extent is allocated, The implicit back refs is used. - * the fields are filled in: + * When a leaf is cow'd new references are added for every file extent found + * in the leaf. 
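
The restored comment above boils each back reference down to a four-field tuple. A hedged sketch of how those fields are filled for the two cases the comment walks through (fresh allocation versus a cow'd leaf), using illustrative names rather than the real on-disk struct btrfs_extent_ref:

#include <stdio.h>
#include <stdint.h>

struct backref_tuple {
	uint64_t root_objectid;  /* subvolume root holding the reference */
	uint64_t generation;     /* trans->transid when the ref was added */
	uint64_t owner_objectid; /* objectid of the file (inode) */
	uint64_t num_refs;       /* references held by the parent node */
};

/* When a file extent is allocated: one reference at the current transid. */
static struct backref_tuple ref_on_alloc(uint64_t root, uint64_t transid,
					 uint64_t ino)
{
	return (struct backref_tuple){ root, transid, ino, 1 };
}

/*
 * When a leaf is cow'd: same root and inode, but the newer cow transid,
 * and one reference for each time the extent appears in the leaf.
 */
static struct backref_tuple ref_on_leaf_cow(uint64_t root, uint64_t cow_transid,
					    uint64_t ino, uint64_t refs_in_leaf)
{
	return (struct backref_tuple){ root, cow_transid, ino, refs_in_leaf };
}

static void show(const char *what, struct backref_tuple t)
{
	printf("%s: (%llu, %llu, %llu, %llu)\n", what,
	       (unsigned long long)t.root_objectid,
	       (unsigned long long)t.generation,
	       (unsigned long long)t.owner_objectid,
	       (unsigned long long)t.num_refs);
}

int main(void)
{
	show("alloc   ", ref_on_alloc(5, 100, 257));
	show("leaf cow", ref_on_leaf_cow(5, 142, 257, 3));
	return 0;
}

Note that only the generation and the reference count differ between the two cases, which is why the removal path described next can match a reference using just the owner, generation, and inode objectid.
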
It looks similar to the create case, but trans->transid will + * be different when the block is cow'd. * - * (root_key.objectid, inode objectid, offset in file, 1) + * (root_key.objectid, trans->transid, inode objectid, + * number of references in the leaf) * - * When a file extent is removed file truncation, we find the - * corresponding implicit back refs and check the following fields: + * When a file extent is removed either during snapshot deletion or + * file truncation, we find the corresponding back reference and check + * the following fields: * - * (btrfs_header_owner(leaf), inode objectid, offset in file) + * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), + * inode objectid) * * Btree extents can be referenced by: * * - Different subvolumes + * - Different generations of the same subvolume * - * Both the implicit back refs and the full back refs for tree blocks - * only consist of key. The key offset for the implicit back refs is - * objectid of block's owner tree. The key offset for the full back refs - * is the first byte of parent block. + * When a tree block is created, back references are inserted: * - * When implicit back refs is used, information about the lowest key and - * level of the tree block are required. These information are stored in - * tree block info structure. + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a tree block is cow'd, new back references are added for all the + * blocks it points to. If the tree block isn't in reference counted root, + * the old back references are removed. These new back references are of + * the form (trans->transid will have increased since creation): + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a backref is in deleting, the following fields are checked: + * + * if backref was for a tree root: + * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) + * else + * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, the key + * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first + * byte of parent extent. If a extent is tree root, the key offset is set + * to the key objectid. 
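
The key-composition rule at the end of the comment is mechanical, so a short sketch can show it directly. The constant and struct below are illustrative stand-ins; the on-disk key layout and the real BTRFS_EXTENT_REF_KEY value live in ctree.h:

#include <stdio.h>
#include <stdint.h>

#define EXTENT_REF_KEY 180	/* placeholder, not the on-disk constant */

struct ref_key {
	uint64_t objectid;	/* first byte of the extent */
	uint8_t  type;		/* BTRFS_EXTENT_REF_KEY in the real tree */
	uint64_t offset;	/* first byte of the parent extent */
};

static struct ref_key compose_ref_key(uint64_t extent_start,
				      uint64_t parent_start, int is_tree_root)
{
	struct ref_key k;

	k.objectid = extent_start;
	k.type = EXTENT_REF_KEY;
	/* a tree root has no parent: the offset points back at itself */
	k.offset = is_tree_root ? extent_start : parent_start;
	return k;
}

int main(void)
{
	struct ref_key child = compose_ref_key(8192, 4096, 0);
	struct ref_key root  = compose_ref_key(8192, 0, 1);

	printf("child ref: (%llu, %u, %llu)\n",
	       (unsigned long long)child.objectid, (unsigned)child.type,
	       (unsigned long long)child.offset);
	printf("root ref:  (%llu, %u, %llu)\n",
	       (unsigned long long)root.objectid, (unsigned)root.type,
	       (unsigned long long)root.offset);
	return 0;
}

Keying the reference by the parent's start byte is what lets lookup_extent_backref below find a specific back reference with a single btrfs_search_slot call instead of scanning every ref item for the extent.
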
*/ -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int convert_extent_item_v0(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 owner, u32 extra_size) -{ - struct btrfs_extent_item *item; - struct btrfs_extent_item_v0 *ei0; - struct btrfs_extent_ref_v0 *ref0; - struct btrfs_tree_block_info *bi; - struct extent_buffer *leaf; - struct btrfs_key key; - struct btrfs_key found_key; - u32 new_size = sizeof(*item); - u64 refs; - int ret; - - leaf = path->nodes[0]; - BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - refs = btrfs_extent_refs_v0(leaf, ei0); - - if (owner == (u64)-1) { - while (1) { - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); - leaf = path->nodes[0]; - } - btrfs_item_key_to_cpu(leaf, &found_key, - path->slots[0]); - BUG_ON(key.objectid != found_key.objectid); - if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { - path->slots[0]++; - continue; - } - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - owner = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - } - btrfs_release_path(root, path); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) - new_size += sizeof(*bi); - - new_size -= sizeof(*ei0); - ret = btrfs_search_slot(trans, root, &key, path, - new_size + extra_size, 1); - if (ret < 0) - return ret; - BUG_ON(ret); - - ret = btrfs_extend_item(trans, root, path, new_size); - BUG_ON(ret); - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, item, refs); - /* FIXME: get real generation */ - btrfs_set_extent_generation(leaf, item, 0); - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - btrfs_set_extent_flags(leaf, item, - BTRFS_EXTENT_FLAG_TREE_BLOCK | - BTRFS_BLOCK_FLAG_FULL_BACKREF); - bi = (struct btrfs_tree_block_info *)(item + 1); - /* FIXME: get first key of the block */ - memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); - btrfs_set_tree_block_level(leaf, bi, (int)owner); - } else { - btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} -#endif - -static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) -{ - u32 high_crc = ~(u32)0; - u32 low_crc = ~(u32)0; - __le64 lenum; - - lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); - - return ((u64)high_crc << 31) ^ (u64)low_crc; -} - -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) -{ - return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), - btrfs_extent_data_ref_objectid(leaf, ref), - btrfs_extent_data_ref_offset(leaf, ref)); -} - -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) -{ - if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != owner || - btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; -} - -static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, 
u64 parent, - u64 root_objectid, - u64 owner, u64 offset) +static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, int del) { struct btrfs_key key; - struct btrfs_extent_data_ref *ref; + struct btrfs_extent_ref *ref; struct extent_buffer *leaf; - u32 nritems; + u64 ref_objectid; int ret; - int recow; - int err = -ENOENT; key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - } -again: - recow = 0; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; - if (parent) { - if (!ret) - return 0; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - key.type = BTRFS_EXTENT_REF_V0_KEY; - btrfs_release_path(root, path); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (!ret) - return 0; -#endif - goto fail; + ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - recow = 1; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != bytenr || - key.type != BTRFS_EXTENT_DATA_REF_KEY) - goto fail; - - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) { - if (recow) { - btrfs_release_path(root, path); - goto again; - } - err = 0; - break; - } - path->slots[0]++; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation || + (ref_objectid != owner_objectid && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + ret = -EIO; + WARN_ON(1); + goto out; } -fail: - return err; + ret = 0; +out: + return ret; } -static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) +static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, + int refs_to_add) { struct btrfs_key key; struct extent_buffer *leaf; - u32 size; + struct btrfs_extent_ref *ref; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - size = sizeof(struct btrfs_shared_data_ref); - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - size = sizeof(struct btrfs_extent_data_ref); - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, size); - if (ret && ret != -EEXIST) - goto fail; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; - leaf = path->nodes[0]; - if (parent) { - struct 
btrfs_shared_data_ref *ref; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); + if (ret == 0) { + leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, ref_generation); + btrfs_set_ref_objectid(leaf, ref, owner_objectid); + btrfs_set_ref_num_refs(leaf, ref, refs_to_add); + } else if (ret == -EEXIST) { + u64 existing_owner; + + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation) { + ret = -EIO; + WARN_ON(1); + goto out; } - } else { - struct btrfs_extent_data_ref *ref; - while (ret == -EEXIST) { - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) - break; - btrfs_release_path(root, path); - key.offset++; - ret = btrfs_insert_empty_item(trans, root, path, &key, - size); - if (ret && ret != -EEXIST) - goto fail; - leaf = path->nodes[0]; - } - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); + + existing_owner = btrfs_ref_objectid(leaf, ref); + if (existing_owner != owner_objectid && + existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { + btrfs_set_ref_objectid(leaf, ref, + BTRFS_MULTIPLE_OBJECTIDS); } + ret = 0; + } else { + goto out; } - btrfs_mark_buffer_dirty(leaf); - ret = 0; -fail: + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: btrfs_release_path(root, path); return ret; } -static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) +static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) { - struct btrfs_key key; - struct btrfs_extent_data_ref *ref1 = NULL; - struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; - u32 num_refs = 0; + struct btrfs_extent_ref *ref; + u32 num_refs; int ret = 0; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct 
btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif - } else { - BUG(); - } - + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + num_refs = btrfs_ref_num_refs(leaf, ref); BUG_ON(num_refs < refs_to_drop); num_refs -= refs_to_drop; - if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); - else if (key.type == BTRFS_SHARED_DATA_REF_KEY) - btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - else { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - btrfs_set_ref_count_v0(leaf, ref0, num_refs); - } -#endif + btrfs_set_ref_num_refs(leaf, ref, num_refs); btrfs_mark_buffer_dirty(leaf); } + btrfs_release_path(root, path); return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) { - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; - u32 num_refs = 0; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (iref) { - if (btrfs_extent_inline_ref_type(leaf, iref) == - BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif - } else { - WARN_ON(1); - } - return num_refs; + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); } +#endif -static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) { - struct btrfs_key key; +#ifdef BIO_RW_DISCARD int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ret == -ENOENT && parent) { - btrfs_release_path(root, path); - 
key.type = BTRFS_EXTENT_REF_V0_KEY; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); } -#endif + return ret; +#else + return 0; +#endif } -static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) +static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) { - struct btrfs_key key; int ret; + int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - btrfs_release_path(root, path); + ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, + orig_parent, parent, orig_root, + ref_root, orig_generation, + ref_generation, owner_objectid, pin); + BUG_ON(ret); return ret; } -static inline int extent_ref_type(u64 parent, u64 owner) +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 orig_parent, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) { - int type; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (parent > 0) - type = BTRFS_SHARED_BLOCK_REF_KEY; - else - type = BTRFS_TREE_BLOCK_REF_KEY; - } else { - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - } - return type; + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + + ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, parent, ref_root, + ref_root, ref_generation, + ref_generation, owner_objectid); + return ret; } +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + int ret; -static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + return ret; +} +static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, + int refs_to_add) { - int level; - BUG_ON(!path->keep_locks); - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - if (!path->nodes[level]) - break; - btrfs_assert_tree_locked(path->nodes[level]); - if (path->slots[level] + 1 >= - btrfs_header_nritems(path->nodes[level])) - continue; - if (level == 0) - btrfs_item_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - else - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - return 1; -} - -/* - * look for inline back ref. 
if back ref is found, *ref_ret is set - * to the address of inline back ref, and 0 is returned. - * - * if back ref isn't found, *ref_ret is set to the address where it - * should be inserted, and -ENOENT is returned. - * - * if insert is true and there are too many inline back refs, the path - * points to the extent item, and -EAGAIN is returned. - * - * NOTE: inline back refs are ordered in the same way that back ref - * items in the tree are ordered. - */ -static noinline_for_stack -int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int insert) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - u64 flags; - u64 item_size; - unsigned long ptr; - unsigned long end; - int extra_size; - int type; - int want; + struct btrfs_path *path; int ret; - int err = 0; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + u32 refs; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; - want = extent_ref_type(parent, owner); - if (insert) { - extra_size = btrfs_extent_inline_ref_size(want); - path->keep_locks = 1; - } else - extra_size = -1; - ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + /* first find the extent item and update its reference count */ + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(ret); - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - if (!insert) { - err = -ENOENT; - goto out; - } - ret = convert_extent_item_v0(trans, root, path, owner, - extra_size); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); - } - - err = -ENOENT; - while (1) { - if (ptr >= end) { - WARN_ON(ptr > end); - break; - } - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); - if (want < type) - break; - if (want > type) { - ptr += btrfs_extent_inline_ref_size(type); - continue; - } - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (match_extent_data_ref(leaf, dref, root_objectid, - owner, offset)) { - err = 0; - break; - } - if (hash_extent_data_ref_item(leaf, dref) < - hash_extent_data_ref(root_objectid, owner, offset)) - break; - } else { - u64 ref_offset; - ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); - if (parent > 0) { - if (parent == ref_offset) { - err = 0; - break; - } - if (ref_offset < parent) - break; - } else { - if (root_objectid == ref_offset) { - err = 0; - break; - } - if (ref_offset < root_objectid) - break; - } - } - ptr 
+= btrfs_extent_inline_ref_size(type); - } - if (err == -ENOENT && insert) { - if (item_size + extra_size >= - BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; - goto out; - } - /* - * To add new inline back ref, we have to make sure - * there is no corresponding back ref item. - * For simplicity, we just do not add new inline back - * ref if there is any kind of item for this block - */ - if (find_next_key(path, &key) == 0 && key.objectid == bytenr && - key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; - goto out; - } - } - *ref_ret = (struct btrfs_extent_inline_ref *)ptr; -out: - if (insert) { - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); - } - return err; -} - -/* - * helper to add new inline back ref - */ -static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - unsigned long ptr; - unsigned long end; - unsigned long item_offset; - u64 refs; - int size; - int type; - int ret; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - item_offset = (unsigned long)iref - (unsigned long)ei; - - type = extent_ref_type(parent, owner); - size = btrfs_extent_inline_ref_size(type); - - ret = btrfs_extend_item(trans, root, path, size); - BUG_ON(ret); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - refs += refs_to_add; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); - if (ptr < end - size) - memmove_extent_buffer(leaf, ptr + size, ptr, - end - size - ptr); - - iref = (struct btrfs_extent_inline_ref *)ptr; - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, dref, owner); - btrfs_set_extent_data_ref_offset(leaf, dref, offset); - btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_shared_data_ref *sref; - sref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} - -static int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) -{ - int ret; - - ret = lookup_inline_extent_backref(trans, root, path, ref_ret, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 0); - if (ret != -ENOENT) + btrfs_set_path_blocking(path); return ret; - - btrfs_release_path(root, path); - *ref_ret = NULL; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = 
lookup_tree_block_ref(trans, root, path, bytenr, parent, - root_objectid); - } else { - ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, - root_objectid, owner, offset); - } - return ret; -} - -/* - * helper to update/remove inline back ref - */ -static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_data_ref *dref = NULL; - struct btrfs_shared_data_ref *sref = NULL; - unsigned long ptr; - unsigned long end; - u32 item_size; - int size; - int type; - int ret; - u64 refs; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); - refs += refs_to_mod; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - type = btrfs_extent_inline_ref_type(leaf, iref); - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - refs = btrfs_extent_data_ref_count(leaf, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - sref = (struct btrfs_shared_data_ref *)(iref + 1); - refs = btrfs_shared_data_ref_count(leaf, sref); - } else { - refs = 1; - BUG_ON(refs_to_mod != -1); } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); - refs += refs_to_mod; - - if (refs > 0) { - if (type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, dref, refs); - else - btrfs_set_shared_data_ref_count(leaf, sref, refs); - } else { - size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - if (ptr + size < end) - memmove_extent_buffer(leaf, ptr, ptr + size, - end - ptr - size); - item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); - BUG_ON(ret); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} - -static noinline_for_stack -int insert_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_extent_inline_ref *iref; - int ret; - - ret = lookup_inline_extent_backref(trans, root, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 1); - if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); - } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); - } - return ret; -} - -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, root, path, bytenr, - parent, root_objectid); - } else { - ret = insert_extent_data_ref(trans, root, path, bytenr, - parent, root_objectid, - owner, offset, refs_to_add); - } - return ret; -} - -static int 
remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) -{ - int ret; - - BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); - } else { - ret = btrfs_del_item(trans, root, path); + if (ret > 0) { + WARN_ON(1); + btrfs_free_path(path); + return -EIO; } - return ret; -} - -#ifdef BIO_RW_DISCARD -static void btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) -{ - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); -} -#endif - -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ -#ifdef BIO_RW_DISCARD - int ret; - u64 map_length = num_bytes; - struct btrfs_multi_bio *multi = NULL; - - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &map_length, &multi, 0); - if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; - int i; + l = path->nodes[0]; - if (map_length > num_bytes) - map_length = num_bytes; - - for (i = 0; i < multi->num_stripes; i++, stripe++) { - btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - map_length); - } - kfree(multi); + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + if (key.objectid != bytenr) { + btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); + printk(KERN_ERR "btrfs wanted %llu found %llu\n", + (unsigned long long)bytenr, + (unsigned long long)key.objectid); + BUG(); } + BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); - return ret; -#else - return 0; -#endif -} + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) -{ - int ret; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && - root_objectid == BTRFS_TREE_LOG_OBJECTID); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + refs_to_add); + btrfs_unlock_up_safe(path, 1); - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); - } - return ret; -} - -static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_extent_item *item; - u64 refs; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 1; - path->leave_spinning = 1; - /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, - root_objectid, owner, offset, - refs_to_add, extent_op); - if (ret == 0) - goto out; - - if (ret != -EAGAIN) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, item); - 
btrfs_set_extent_refs(leaf, item, refs + refs_to_add); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, item); + btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; @@ -1514,197 +802,56 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + path, bytenr, parent, + ref_root, ref_generation, + owner_objectid, refs_to_add); BUG_ON(ret); -out: btrfs_free_path(path); - return err; -} - -static int run_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) -{ - int ret = 0; - struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; - u64 parent = 0; - u64 ref_root = 0; - u64 flags = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ref = btrfs_delayed_node_to_data_ref(node); - if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; - else - ref_root = ref->root; - - if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); - flags |= extent_op->flags_to_set; - } - ret = alloc_reserved_file_extent(trans, root, - parent, ref_root, flags, - ref->objectid, ref->offset, - &ins, node->ref_mod); - update_reserved_extents(root, ins.objectid, ins.offset, 0); - } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); - } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); - } else { - BUG(); - } - return ret; -} - -static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, - struct extent_buffer *leaf, - struct btrfs_extent_item *ei) -{ - u64 flags = btrfs_extent_flags(leaf, ei); - if (extent_op->update_flags) { - flags |= extent_op->flags_to_set; - btrfs_set_extent_flags(leaf, ei, flags); - } - - if (extent_op->update_key) { - struct btrfs_tree_block_info *bi; - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - btrfs_set_tree_block_key(leaf, bi, &extent_op->key); - } + return 0; } -static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - u32 item_size; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; - - path->reada = 1; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - err = -EIO; - goto out; - } - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - ret = convert_extent_item_v0(trans, 
root->fs_info->extent_root, - path, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - __run_delayed_extent_op(extent_op, leaf, ei); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return err; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; } -static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) +static int drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node) { int ret = 0; - struct btrfs_delayed_tree_ref *ref; - struct btrfs_key ins; - u64 parent = 0; - u64 ref_root = 0; + struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; + BUG_ON(node->ref_mod == 0); + ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, ref->pin, node->ref_mod); - ref = btrfs_delayed_node_to_tree_ref(node); - if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - else - ref_root = ref->root; - - BUG_ON(node->ref_mod != 1); - if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); - ret = alloc_reserved_tree_block(trans, root, - parent, ref_root, - extent_op->flags_to_set, - &extent_op->key, - ref->level, &ins); - update_reserved_extents(root, ins.objectid, ins.offset, 0); - } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); - } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); - } else { - BUG(); - } return ret; } - /* helper function to actually process a single delayed ref entry */ -static int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) +static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + int insert_reserved) { int ret; - if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref *ref; + + if (node->parent == (u64)-1) { struct btrfs_delayed_ref_head *head; /* * we've hit the end of the chain and we were supposed @@ -1712,35 +859,44 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, * deleted before we ever needed to insert it, so all * we have to do is clean up the accounting */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); if (insert_reserved) { - if (head->is_data) { - ret = 
btrfs_del_csums(trans, root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); - } - btrfs_update_pinned_extents(root, node->bytenr, - node->num_bytes, 1); update_reserved_extents(root, node->bytenr, node->num_bytes, 0); } + head = btrfs_delayed_node_to_head(node); mutex_unlock(&head->mutex); return 0; } - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, root, node, extent_op, - insert_reserved); - else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, root, node, extent_op, - insert_reserved); - else - BUG(); - return ret; + ref = btrfs_delayed_node_to_ref(node); + if (ref->action == BTRFS_ADD_DELAYED_REF) { + if (insert_reserved) { + struct btrfs_key ins; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + /* record the full extent allocation */ + ret = __btrfs_alloc_reserved_extent(trans, root, + node->parent, ref->root, + ref->generation, ref->owner_objectid, + &ins, node->ref_mod); + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } else { + /* just add one backref */ + ret = add_extent_ref(trans, root, node->bytenr, + node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, node->ref_mod); + } + BUG_ON(ret); + } else if (ref->action == BTRFS_DROP_DELAYED_REF) { + WARN_ON(insert_reserved); + ret = drop_delayed_ref(trans, root, node); + } + return 0; } static noinline struct btrfs_delayed_ref_node * @@ -1763,7 +919,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) rb_node); if (ref->bytenr != head->node.bytenr) break; - if (ref->action == action) + if (btrfs_delayed_node_to_ref(ref)->action == action) return ref; node = rb_prev(node); } @@ -1781,7 +937,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; - struct btrfs_delayed_extent_op *extent_op; int ret; int count = 0; int must_insert_reserved = 0; @@ -1820,9 +975,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, must_insert_reserved = locked_ref->must_insert_reserved; locked_ref->must_insert_reserved = 0; - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates @@ -1834,25 +986,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * so that any accounting fixes can happen */ ref = &locked_ref->node; - - if (extent_op && must_insert_reserved) { - kfree(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&delayed_refs->lock); - - ret = run_delayed_extent_op(trans, root, - ref, extent_op); - BUG_ON(ret); - kfree(extent_op); - - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; - } - list_del_init(&locked_ref->cluster); locked_ref = NULL; } @@ -1860,17 +993,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root, ref, extent_op, + ret = run_one_delayed_ref(trans, root, ref, must_insert_reserved); BUG_ON(ret); - btrfs_put_delayed_ref(ref); - kfree(extent_op); - count++; + count++; cond_resched(); spin_lock(&delayed_refs->lock); } @@ -1965,112 
+1095,25 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 flags, - int is_data) -{ - struct btrfs_delayed_extent_op *extent_op; - int ret; - - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - if (!extent_op) - return -ENOMEM; - - extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; - - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); - if (ret) - kfree(extent_op); - return ret; -} - -static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - int ret = 0; - - ret = -ENOENT; - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; - - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return -EAGAIN; - } - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; - - data_ref = btrfs_delayed_node_to_data_ref(ref); - - node = rb_prev(node); - if (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) - goto out_unlock; - } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: - mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - -static noinline int check_committed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr) { struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref; - struct btrfs_extent_inline_ref *iref; - struct btrfs_extent_item *ei; + struct btrfs_extent_ref *ref_item; struct btrfs_key key; - u32 item_size; + struct btrfs_key found_key; + u64 ref_root; + u64 last_snapshot; + u32 nritems; int ret; key.objectid = bytenr; key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + path = btrfs_alloc_path(); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -2082,83 +1125,55 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, path->slots[0]--; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; - - ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct 
btrfs_extent_item_v0)); - goto out; - } -#endif - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) - goto out; - - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out; - - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - if (btrfs_extent_inline_ref_type(leaf, iref) != - BTRFS_EXTENT_DATA_REF_KEY) - goto out; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (btrfs_extent_refs(leaf, ei) != - btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || - btrfs_extent_data_ref_offset(leaf, ref) != offset) + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY) goto out; - ret = 0; -out: - return ret; -} - -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_path *path; - int ret; - int ret2; + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret == 0) + continue; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr) + break; - path = btrfs_alloc_path(); - if (!path) - return -ENOENT; + if (found_key.type != BTRFS_EXTENT_REF_KEY) { + path->slots[0]++; + continue; + } - do { - ret = check_committed_ref(trans, root, path, objectid, - offset, bytenr); - if (ret && ret != -ENOENT) + ref_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_root = btrfs_ref_root(leaf, ref_item); + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { + ret = 1; goto out; + } + if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { + ret = 1; + goto out; + } - ret2 = check_delayed_ref(trans, root, path, objectid, - offset, bytenr); - } while (ret2 == -EAGAIN); - - if (ret2 && ret2 != -ENOENT) { - ret = ret2; - goto out; + path->slots[0]++; } - - if (ret != -ENOENT || ret2 != -ENOENT) - ret = 0; + ret = 0; out: btrfs_free_path(path); return ret; } -#if 0 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u32 nr_extents) { @@ -2276,49 +1291,191 @@ static int refsort_cmp(const void *a_void, const void *b_void) return 1; return 0; } -#endif -static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + +noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, - int full_backref, int inc) + struct extent_buffer *orig_buf, + struct extent_buffer *buf, u32 *nr_extents) { u64 bytenr; - u64 num_bytes; - u64 parent; u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct refsort *sorted; u32 nritems; + u32 nr_file_extents = 0; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int level; int ret = 0; + int faili = 0; + int refi = 0; + int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); + ref_generation = 
btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) - return 0; + sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); + BUG_ON(!sorted); - if (inc) - process_func = btrfs_inc_extent_ref; - else - process_func = btrfs_free_extent; + if (root->ref_cows) { + process_func = __btrfs_inc_extent_ref; + } else { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + goto out; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + goto out; + process_func = __btrfs_update_extent_ref; + } + + /* + * we make two passes through the items. In the first pass we + * only record the byte number and slot. Then we sort based on + * byte number and do the actual work based on the sorted results + */ + for (i = 0; i < nritems; i++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + nr_file_extents++; + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } else { + bytenr = btrfs_node_blockptr(buf, i); + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } + } + /* + * if refi == 0, we didn't actually put anything into the sorted + * array and we're done + */ + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + cond_resched(); + slot = sorted[i].slot; + bytenr = sorted[i].bytenr; + + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + ret = process_func(trans, root, bytenr, + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + + if (ret) { + faili = slot; + WARN_ON(1); + goto fail; + } + } else { + ret = process_func(trans, root, bytenr, buf->len, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) { + faili = slot; + WARN_ON(1); + goto fail; + } + } + } +out: + kfree(sorted); + if (nr_extents) { + if (level == 0) + *nr_extents = nr_file_extents; + else + *nr_extents = nritems; + } + return 0; +fail: + kfree(sorted); + WARN_ON(1); + return ret; +} + +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr) + +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int ret; + int slot; + int level; + + BUG_ON(start_slot < 0); + BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + level = btrfs_header_level(buf); - if (full_backref) - parent = buf->start; - else - parent = 0; + if (!root->ref_cows) { + if (level == 0 && + 
root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + return 0; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + } - for (i = 0; i < nritems; i++) { + for (i = 0, slot = start_slot; i < nr; i++, slot++) { + cond_resched(); if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, i); + btrfs_item_key_to_cpu(buf, &key, slot); if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; - fi = btrfs_item_ptr(buf, i, + fi = btrfs_item_ptr(buf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) @@ -2326,39 +1483,28 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, bytenr = btrfs_file_extent_disk_bytenr(buf, fi); if (bytenr == 0) continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); - key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, key.objectid, - key.offset); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, orig_generation, + ref_generation, key.objectid); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + bytenr = btrfs_node_blockptr(buf, slot); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + buf->len, orig_buf->start, + buf->start, orig_root, ref_root, + orig_generation, ref_generation, + level - 1); if (ret) goto fail; } } return 0; fail: - BUG(); - return ret; -} - -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) -{ - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); -} - -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) -{ - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + WARN_ON(1); + return -1; } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2861,24 +2007,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); - - /* block accounting for root item */ - old_val = btrfs_root_used(&root->root_item); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_root_used(&root->root_item, old_val); - spin_unlock(&info->delalloc_lock); - while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) @@ -3088,6 +2216,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_owner = btrfs_header_owner(buf); u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *must_clean = buf; @@ -3105,77 +2235,63 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, return 0; } - -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int 
refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) +/* + * remove an extent from the root, returns 0 on success + */ +static int __free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, int mark_free, + int refs_to_drop) { - struct btrfs_key key; struct btrfs_path *path; + struct btrfs_key key; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; int ret; - int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - u32 item_size; - u64 refs; + struct btrfs_extent_item *ei; + u32 refs; + key.objectid = bytenr; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.offset = num_bytes; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = 1; path->leave_spinning = 1; - - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); - - ret = lookup_extent_backref(trans, extent_root, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner_objectid, - owner_offset); + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, parent, root_objectid, + ref_generation, owner_objectid, 1); if (ret == 0) { + struct btrfs_key found_key; extent_slot = path->slots[0]; - while (extent_slot >= 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, + while (extent_slot > 0) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, extent_slot); - if (key.objectid != bytenr) + if (found_key.objectid != bytenr) break; - if (key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == num_bytes) { + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == num_bytes) { found_extent = 1; break; } if (path->slots[0] - extent_slot > 5) break; - extent_slot--; } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); - if (found_extent && item_size < sizeof(*ei)) - found_extent = 0; -#endif if (!found_extent) { - BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, - is_data); + refs_to_drop); BUG_ON(ret); btrfs_release_path(extent_root, path); path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -3191,98 +2307,82 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", + "parent %llu root %llu gen %llu owner %llu\n", (unsigned long long)bytenr, (unsigned long long)parent, (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + (unsigned long long)ref_generation, + (unsigned long long)owner_objectid); } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - BUG_ON(found_extent || extent_slot != path->slots[0]); - ret = convert_extent_item_v0(trans, extent_root, path, - owner_objectid, 0); - BUG_ON(ret < 0); - - btrfs_release_path(extent_root, path); - path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - ret = btrfs_search_slot(trans, 
extent_root, &key, path, - -1, 1); - if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); - } - BUG_ON(ret); - extent_slot = path->slots[0]; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); - } -#endif - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); - } - refs = btrfs_extent_refs(leaf, ei); + + /* + * we're not allowed to delete the extent item if there + * are other delayed ref updates pending + */ + BUG_ON(refs < refs_to_drop); refs -= refs_to_drop; + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); - if (refs > 0) { - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - /* - * In the case of inline back ref, reference count will - * be updated by remove_extent_backref + if (refs == 0 && found_extent && + path->slots[0] == extent_slot + 1) { + struct btrfs_extent_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); + /* if the back ref and the extent are next to each other + * they get deleted below in one shot */ - if (iref) { - BUG_ON(!found_extent); - } else { - btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); - } - if (found_extent) { - ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, - is_data); + path->slots[0] = extent_slot; + num_to_del = 2; + } else if (found_extent) { + /* otherwise delete the extent back ref */ + ret = remove_extent_backref(trans, extent_root, path, + refs_to_drop); + BUG_ON(ret); + /* if refs are 0, we need to setup the path for deletion */ + if (refs == 0) { + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); BUG_ON(ret); } - } else { - int mark_free = 0; + } + + if (refs == 0) { + u64 super_used; + u64 root_used; struct extent_buffer *must_clean = NULL; - if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); - if (iref) { - BUG_ON(path->slots[0] != extent_slot); - } else { - BUG_ON(path->slots[0] != extent_slot + 1); - path->slots[0] = extent_slot; - num_to_del = 2; - } + if (pin) { + ret = pin_down_bytes(trans, root, path, + bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, + &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, + root_used - num_bytes); + spin_unlock(&info->delalloc_lock); + /* * it is going to be very rare for someone to be waiting * on the block we're freeing. 
del_items might need to @@ -3303,7 +2403,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, free_extent_buffer(must_clean); } - if (is_data) { + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); } else { @@ -3320,6 +2420,34 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * remove an extent from the root, returns 0 on success + */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, + int refs_to_drop) +{ + WARN_ON(num_bytes < root->sectorsize); + + /* + * if metadata always pin + * if data pin when any transaction has committed this + */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || + ref_generation != trans->transid) + pin = 1; + + if (ref_generation != trans->transid) + pin = 1; + + return __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0, refs_to_drop); +} + /* * when we free an extent, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for @@ -3351,13 +2479,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (ref->bytenr == bytenr) goto out; - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - kfree(head->extent_op); - head->extent_op = NULL; - } - /* * waiting for the lock here would deadlock. If someone else has it * locked they are already in the process of dropping it anyway @@ -3386,8 +2507,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); + &head->node, head->must_insert_reserved); BUG_ON(ret); btrfs_put_delayed_ref(&head->node); return 0; @@ -3399,32 +2519,32 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) { int ret; /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. + * + * data extents referenced by the tree log do need to have + * their reference counts bumped. 
*/ - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { - WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); ret = 0; - } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, + BTRFS_DROP_DELAYED_REF, 1); BUG_ON(ret); ret = check_ref_cleanup(trans, root, bytenr); BUG_ON(ret); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); - BUG_ON(ret); } return ret; } @@ -3599,7 +2719,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, last_ptr_loop = 0; /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, + ret = btrfs_find_space_cluster(trans, block_group, last_ptr, offset, num_bytes, empty_cluster + empty_size); @@ -3849,147 +2969,99 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, return ret; } -static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins, + int ref_mod) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; + u64 super_used; + u64 root_used; + u64 num_bytes = ins->offset; + u32 sizes[2]; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; struct btrfs_extent_item *extent_item; - struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_ref *ref; struct btrfs_path *path; - struct extent_buffer *leaf; - int type; - u32 size; - - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); - - path = btrfs_alloc_path(); - BUG_ON(!path); - - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); - BUG_ON(ret); + struct btrfs_key keys[2]; - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, ref_mod); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_DATA); - - iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (parent > 0) { - struct btrfs_shared_data_ref *ref; - ref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); - } else { - struct btrfs_extent_data_ref *ref; - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); - } + 
if (parent == 0) + parent = ins->objectid; - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); - if (ret) { - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); - BUG(); - } - return ret; -} + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_used + num_bytes); + spin_unlock(&info->delalloc_lock); -static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) -{ - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_extent_item *extent_item; - struct btrfs_tree_block_info *block_info; - struct btrfs_extent_inline_ref *iref; - struct btrfs_path *path; - struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + memcpy(&keys[0], ins, sizeof(*ins)); + keys[1].objectid = ins->objectid; + keys[1].type = BTRFS_EXTENT_REF_KEY; + keys[1].offset = parent; + sizes[0] = sizeof(*extent_item); + sizes[1] = sizeof(*ref); path = btrfs_alloc_path(); BUG_ON(!path); path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + ret = btrfs_insert_empty_items(trans, extent_root, path, keys, + sizes, 2); BUG_ON(ret); - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, 1); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); - - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); - if (parent > 0) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } + btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_extent_ref); - btrfs_mark_buffer_dirty(leaf); + btrfs_set_ref_root(path->nodes[0], ref, root_objectid); + btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); + btrfs_set_ref_objectid(path->nodes[0], ref, owner); + btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + trans->alloc_exclude_start = 0; + trans->alloc_exclude_nr = 0; btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + if (ret) + goto out; + + ret = update_block_group(trans, root, ins->objectid, + ins->offset, 1, 0); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, 
(unsigned long long)ins->offset); BUG(); } +out: return ret; } -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) { int ret; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner, + BTRFS_ADD_DELAYED_EXTENT, 0); + BUG_ON(ret); return ret; } @@ -3998,10 +3070,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, * an extent has been allocated and makes sure to clear the free * space cache bits as well */ -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins) +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) { int ret; struct btrfs_block_group_cache *block_group; @@ -4015,8 +3087,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ins->offset); BUG_ON(ret); btrfs_put_block_group(block_group); - ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, - 0, owner, offset, ins, 1); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins, 1); return ret; } @@ -4027,48 +3099,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, * * returns 0 if everything worked, non-zero otherwise. 
*/ -static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_alloc_size, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data) { int ret; - u64 flags = 0; - - ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); + ret = __btrfs_reserve_extent(trans, root, num_bytes, + min_alloc_size, empty_size, hint_byte, + search_end, ins, data); BUG_ON(ret); - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_EXTENT, 0); BUG_ON(ret); } + update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -4107,17 +3157,21 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size) { struct btrfs_key ins; int ret; struct extent_buffer *buf; - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, + root_objectid, ref_generation, level, + empty_size, hint, (u64)-1, &ins, 0); if (ret) { BUG_ON(ret > 0); return ERR_PTR(ret); @@ -4131,19 +3185,32 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { - u64 disk_bytenr; - u64 num_bytes; + u64 leaf_owner; + u64 leaf_generation; + struct refsort *sorted; struct btrfs_key key; struct btrfs_file_extent_item *fi; - u32 nritems; int i; + int nritems; int ret; + int refi = 0; + int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + /* we do this loop twice. The first time we build a list + * of the extents we have a reference on, then we sort the list + * by bytenr. 
The second time around we actually do the + * extent freeing. + */ for (i = 0; i < nritems; i++) { + u64 disk_bytenr; cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); /* only extents have references, skip everything else */ @@ -4163,16 +3230,45 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, if (disk_bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, - leaf->start, 0, key.objectid, 0); + sorted[refi].bytenr = disk_bytenr; + sorted[refi].slot = i; + refi++; + } + + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + u64 disk_bytenr; + + disk_bytenr = sorted[i].bytenr; + slot = sorted[i].slot; + + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + ret = btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); } +out: + kfree(sorted); return 0; } -#if 0 - static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_leaf_ref *ref) @@ -4215,14 +3311,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } - static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); + ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -4257,7 +3352,6 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, return ret; } - /* * this is used while deleting old snapshots, and it drops the refs * on a whole subtree starting from a level 1 node. @@ -4551,36 +3645,32 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, cond_resched(); return 0; } -#endif /* * helper function for drop_subtree, this function is similar to * walk_down_tree. The main difference is that it checks reference * counts while tree blocks are locked. 
*/ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) { struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; u64 bytenr; u64 ptr_gen; - u64 refs; - u64 flags; u32 blocksize; + u32 refs; int ret; cur = path->nodes[*level]; - ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, - &refs, &flags); + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); BUG_ON(ret); if (refs > 1) goto out; - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - while (*level >= 0) { cur = path->nodes[*level]; if (*level == 0) { @@ -4602,15 +3692,16 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); BUG_ON(ret); if (refs > 1) { parent = path->nodes[*level]; ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - *level - 1, 0); + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); BUG_ON(ret); path->slots[*level]++; btrfs_tree_unlock(next); @@ -4618,8 +3709,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, continue; } - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - *level = btrfs_header_level(next); path->nodes[*level] = next; path->slots[*level] = 0; @@ -4627,15 +3716,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, cond_resched(); } out: - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; + parent = path->nodes[*level + 1]; bytenr = path->nodes[*level]->start; blocksize = path->nodes[*level]->len; - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, - btrfs_header_owner(parent), *level, 0); + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); BUG_ON(ret); if (path->locks[*level]) { @@ -4659,6 +3746,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, int max_level) { + u64 root_owner; + u64 root_gen; struct btrfs_root_item *root_item = &root->root_item; int i; int slot; @@ -4666,22 +3755,24 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, for (i = *level; i < max_level && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + /* * there is more work to do in this level. 
* Update the drop_progress marker to reflect * the work we've done so far, and then bump * the slot number */ + node = path->nodes[i]; path->slots[i]++; - WARN_ON(*level == 0); - if (max_level == BTRFS_MAX_LEVEL) { - btrfs_node_key(path->nodes[i], - &root_item->drop_progress, - path->slots[i]); - root_item->drop_level = i; - } *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; return 0; } else { struct extent_buffer *parent; @@ -4695,20 +3786,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, else parent = path->nodes[*level + 1]; - clean_tree_block(trans, root, path->nodes[i]); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); ret = btrfs_free_extent(trans, root, - path->nodes[i]->start, - path->nodes[i]->len, - parent->start, - btrfs_header_owner(parent), - *level, 0); + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); BUG_ON(ret); if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[i]); - path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; *level = i + 1; } } @@ -4727,18 +3820,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root int wret; int level; struct btrfs_path *path; + int i; + int orig_level; int update_count; struct btrfs_root_item *root_item = &root->root_item; + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); path = btrfs_alloc_path(); BUG_ON(!path); level = btrfs_header_level(root->node); + orig_level = level; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(path->nodes[level]); + path->nodes[level] = root->node; + extent_buffer_get(root->node); path->slots[level] = 0; - path->locks[level] = 1; } else { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -4760,7 +3856,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ - btrfs_unlock_up_safe(path, 0); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } } while (1) { unsigned long update; @@ -4781,6 +3882,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EAGAIN; break; } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); for (update_count = 0; update_count < 16; update_count++) { update = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -4790,6 +3893,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; } } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } out: btrfs_free_path(path); return ret; @@ -4822,7 +3931,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_tree(trans, root, path, &level); + wret = walk_down_subtree(trans, root, path, &level); if (wret < 0) ret = wret; if (wret != 0) @@ -4839,7 +3948,6 @@ int 
btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -#if 0 static unsigned long calc_ra(unsigned long start, unsigned long last, unsigned long nr) { @@ -6321,7 +5429,6 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, kfree(ref_path); return ret; } -#endif static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { @@ -6370,8 +5477,7 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, u64 calc; spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { spin_unlock(&shrink_block_group->lock); trans = btrfs_start_transaction(root, 1); @@ -6396,17 +5502,6 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, return 0; } - -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) - -{ - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); - return 0; -} - -#if 0 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 size) @@ -6686,7 +5781,6 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) btrfs_free_path(path); return ret; } -#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 68260180f587..fe9eb990e443 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -476,7 +476,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; - u64 last_end; int err; int set = 0; @@ -499,7 +498,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (state->start > end) goto out; WARN_ON(state->end < start); - last_end = state->end; /* * | ---- desired range ---- | @@ -526,11 +524,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (err) goto out; if (state->end <= end) { + start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; } else { start = state->start; } @@ -556,10 +552,8 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, goto out; } + start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; goto search_again; out: @@ -713,10 +707,8 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, goto out; } set_state_bits(tree, state, bits); + start = state->end + 1; merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; goto search_again; } @@ -750,10 +742,8 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, goto out; if (state->end <= end) { set_state_bits(tree, state, bits); + start = state->end + 1; merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; } else { start = state->start; } diff --git a/trunk/fs/btrfs/file.c b/trunk/fs/btrfs/file.c index 126477eaecf5..1d51dc38bb49 100644 --- a/trunk/fs/btrfs/file.c +++ b/trunk/fs/btrfs/file.c @@ -291,12 +291,16 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, { u64 extent_end = 0; u64 search_start = start; + u64 leaf_start; u64 ram_bytes = 0; + u64 orig_parent = 0; 
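	/*
	 * Note on the bookkeeping added above (a reading of this hunk, not
	 * part of the original patch text): with the parent/generation
	 * based backrefs this patch restores, btrfs_drop_extents() has to
	 * remember where an extent pointer used to live. leaf_start,
	 * root_owner and root_gen are filled from the leaf that held the
	 * file extent item and are later handed to btrfs_free_extent();
	 * orig_parent records path->nodes[0]->start before the leaf can be
	 * COWed or split, so btrfs_update_extent_ref() can move the backref
	 * from the original leaf to the new one (leaf->start) afterwards.
	 */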
u64 disk_bytenr = 0; u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; + u64 root_gen; + u64 root_owner; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; struct btrfs_path *path; @@ -336,6 +340,9 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, bookend = 0; found_extent = 0; found_inline = 0; + leaf_start = 0; + root_gen = 0; + root_owner = 0; compression = 0; encryption = 0; extent = NULL; @@ -410,6 +417,9 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + leaf_start = leaf->start; } if (end < extent_end && end >= key.offset) { @@ -433,14 +443,14 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, } locked_end = extent_end; } + orig_parent = path->nodes[0]->start; disk_bytenr = le64_to_cpu(old.disk_bytenr); if (disk_bytenr != 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, - le64_to_cpu(old.disk_num_bytes), 0, - root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + le64_to_cpu(old.disk_num_bytes), + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); BUG_ON(ret); } } @@ -558,6 +568,17 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_set_lock_blocking(path->nodes[0]); + if (disk_bytenr != 0) { + ret = btrfs_update_extent_ref(trans, root, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, + leaf->start, + root->root_key.objectid, + trans->transid, ins.objectid); + + BUG_ON(ret); + } path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) @@ -573,9 +594,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, old_disk_bytenr, le64_to_cpu(old.disk_num_bytes), - 0, root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + leaf_start, root_owner, + root_gen, key.objectid, 0); BUG_ON(ret); *hint_byte = old_disk_bytenr; } @@ -644,11 +664,12 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, u64 bytenr; u64 num_bytes; u64 extent_end; - u64 orig_offset; + u64 extent_offset; u64 other_start; u64 other_end; u64 split = start; u64 locked_end = end; + u64 orig_parent; int extent_type; int split_end = 1; int ret; @@ -682,7 +703,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); if (key.offset == start) split = end; @@ -690,6 +711,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, if (key.offset == start && extent_end == end) { int del_nr = 0; int del_slot = 0; + u64 leaf_owner = btrfs_header_owner(leaf); + u64 leaf_gen = btrfs_header_generation(leaf); other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, @@ -698,8 +721,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); BUG_ON(ret); } other_start = 0; @@ -710,8 +733,8 @@ int 
btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); BUG_ON(ret); } split_end = 0; @@ -745,12 +768,13 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, locked_end = extent_end; } btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); + extent_offset += split - key.offset; } else { BUG_ON(key.offset != start); - key.offset = split; - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); + btrfs_set_file_extent_offset(leaf, fi, extent_offset + + split - key.offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); + key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); extent_end = split; } @@ -769,8 +793,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - split); goto done; @@ -792,9 +815,10 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, - inode->i_ino, orig_offset); + orig_parent = leaf->start; + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); BUG_ON(ret); btrfs_release_path(root, path); @@ -809,12 +833,20 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_set_file_extent_type(leaf, fi, extent_type); btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_compression(leaf, fi, 0); btrfs_set_file_extent_encryption(leaf, fi, 0); btrfs_set_file_extent_other_encoding(leaf, fi, 0); + + if (orig_parent != leaf->start) { + ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } done: btrfs_mark_buffer_dirty(leaf); @@ -1157,8 +1189,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_wait_ordered_range(inode, 0, (u64)-1); root->log_batch++; - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ diff --git a/trunk/fs/btrfs/free-space-cache.c b/trunk/fs/btrfs/free-space-cache.c index 4538e48581a5..0bc93657b460 100644 --- a/trunk/fs/btrfs/free-space-cache.c +++ b/trunk/fs/btrfs/free-space-cache.c @@ -579,7 +579,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, * it returns -enospc */ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -596,9 +595,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, int ret; /* for metadata, allow allocates 
with more holes */ - if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; - } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { /* * we want to do larger allocations when we are * flushing out the delayed refs, it helps prevent @@ -648,15 +645,14 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * we haven't filled the empty size and the window is * very large. reset and try again */ - if (next->offset - (last->offset + last->bytes) > 128 * 1024 || - next->offset - window_start > (bytes + empty_size) * 2) { + if (next->offset - window_start > (bytes + empty_size) * 2) { entry = next; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = 0; total_retries++; - if (total_retries % 64 == 0) { + if (total_retries % 256 == 0) { if (min_bytes >= (bytes + empty_size)) { ret = -ENOSPC; goto out; diff --git a/trunk/fs/btrfs/free-space-cache.h b/trunk/fs/btrfs/free-space-cache.h index 266fb8764054..ab0bdc0a63ce 100644 --- a/trunk/fs/btrfs/free-space-cache.h +++ b/trunk/fs/btrfs/free-space-cache.h @@ -31,7 +31,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); diff --git a/trunk/fs/btrfs/hash.h b/trunk/fs/btrfs/hash.h index db2ff9773b99..2a020b276768 100644 --- a/trunk/fs/btrfs/hash.h +++ b/trunk/fs/btrfs/hash.h @@ -19,9 +19,9 @@ #ifndef __HASH__ #define __HASH__ -#include +#include "crc32c.h" static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } #endif diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 8612b3a09811..1c8b0190d031 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -48,6 +48,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "ref-cache.h" #include "compression.h" #include "locking.h" @@ -368,7 +369,7 @@ static noinline int compress_file_range(struct inode *inode, * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. 
*/ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && + if (!btrfs_test_flag(inode, NOCOMPRESS) && btrfs_test_opt(root, COMPRESS)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -469,7 +470,7 @@ static noinline int compress_file_range(struct inode *inode, nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + btrfs_set_flag(inode, NOCOMPRESS); } if (will_compress) { *num_added += 1; @@ -862,7 +863,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (btrfs_test_flag(inode, NOCOMPRESS)) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -943,7 +944,6 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 cow_start; u64 cur_offset; u64 extent_end; - u64 extent_offset; u64 disk_bytenr; u64 num_bytes; int extent_type; @@ -1005,7 +1005,6 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (extent_end <= start) { @@ -1023,10 +1022,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; if (btrfs_cross_ref_exist(trans, root, inode->i_ino, - found_key.offset - - extent_offset, disk_bytenr)) + disk_bytenr)) goto out_check; - disk_bytenr += extent_offset; + disk_bytenr += btrfs_file_extent_offset(leaf, fi); disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* @@ -1133,10 +1131,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS)) @@ -1290,7 +1288,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + skip_sum = btrfs_test_flag(inode, NODATASUM); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); @@ -1491,9 +1489,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root, - root->root_key.objectid, - inode->i_ino, file_pos, &ins); + ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino, &ins); BUG_ON(ret); btrfs_free_path(path); @@ -1790,8 +1788,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (btrfs_test_flag(inode, NODATASUM)) return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && @@ -1959,13 +1956,23 @@ void 
btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ - found_key.objectid = found_key.offset; - found_key.type = BTRFS_INODE_ITEM_KEY; - found_key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &found_key, root); - if (IS_ERR(inode)) + inode = btrfs_iget_locked(root->fs_info->sb, + found_key.offset, root); + if (!inode) break; + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it @@ -2062,7 +2069,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static void btrfs_read_locked_inode(struct inode *inode) +void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2157,8 +2164,6 @@ static void btrfs_read_locked_inode(struct inode *inode) init_special_inode(inode, inode->i_mode, rdev); break; } - - btrfs_update_iflags(inode); return; make_bad: @@ -2322,6 +2327,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_update_inode(trans, root, dir); btrfs_drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; out: return ret; } @@ -2593,8 +2599,9 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; u64 extent_start = 0; u64 extent_num_bytes = 0; - u64 extent_offset = 0; u64 item_end = 0; + u64 root_gen = 0; + u64 root_owner = 0; int found_extent; int del_item; int pending_del_nr = 0; @@ -2709,9 +2716,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - extent_offset = found_key.offset - - btrfs_file_extent_offset(leaf, fi); - /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { @@ -2719,6 +2723,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (root->ref_cows) inode_sub_bytes(inode, num_dec); } + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* @@ -2762,12 +2768,12 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, } else { break; } - if (found_extent && root->ref_cows) { + if (found_extent) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, 0, - btrfs_header_owner(leaf), - inode->i_ino, extent_offset); + extent_num_bytes, + leaf->start, root_owner, + root_gen, inode->i_ino, 0); BUG_ON(ret); } next: @@ -2805,6 +2811,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, pending_del_nr); } btrfs_free_path(path); + inode->i_sb->s_dirt = 1; return ret; } @@ -3098,45 +3105,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, return 0; } -static void inode_tree_add(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_inode *entry; - struct rb_node **p = &root->inode_tree.rb_node; - struct rb_node *parent = NULL; - - spin_lock(&root->inode_lock); - while (*p) { - parent = *p; - entry = rb_entry(parent, struct 
btrfs_inode, rb_node); - - if (inode->i_ino < entry->vfs_inode.i_ino) - p = &(*p)->rb_left; - else if (inode->i_ino > entry->vfs_inode.i_ino) - p = &(*p)->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING | I_CLEAR))); - break; - } - } - rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); - rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); -} - -static void inode_tree_del(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - spin_lock(&root->inode_lock); - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - } -} - static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); @@ -3162,7 +3130,6 @@ static noinline void init_btrfs_i(struct inode *inode) inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3185,9 +3152,26 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, - struct btrfs_root *root) +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + if (wait) { + inode = ilookup5(s, objectid, btrfs_find_actor, + (void *)&args); + } else { + inode = ilookup5_nowait(s, objectid, btrfs_find_actor, + (void *)&args); + } + return inode; +} + +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -3204,21 +3188,24 @@ static struct inode *btrfs_iget_locked(struct super_block *s, * Returns in *is_new if the inode was read from disk */ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root) + struct btrfs_root *root, int *is_new) { struct inode *inode; inode = btrfs_iget_locked(s, location->objectid, root); if (!inode) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EACCES); if (inode->i_state & I_NEW) { BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - - inode_tree_add(inode); unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; } return inode; @@ -3231,7 +3218,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret; + int ret, new; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3249,7 +3236,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - inode = btrfs_iget(dir->i_sb, &location, sub_root); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); if (IS_ERR(inode)) return ERR_CAST(inode); } @@ -3587,9 +3574,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_find_block_group(root, 0, alloc_hint, owner); if ((mode & S_IFREG)) { if 
(btrfs_test_opt(root, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + btrfs_set_flag(inode, NODATASUM); if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + btrfs_set_flag(inode, NODATACOW); } key[0].objectid = objectid; @@ -3643,10 +3630,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); - insert_inode_hash(inode); - inode_tree_add(inode); return inode; fail: if (dir) @@ -3766,6 +3750,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3830,6 +3815,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3876,6 +3862,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) drop_inode = 1; + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); @@ -3957,6 +3944,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); @@ -4695,7 +4683,6 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -4985,6 +4972,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); if (drop_inode) @@ -5073,7 +5061,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, out: if (cur_offset > start) { inode->i_ctime = CURRENT_TIME; - BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + btrfs_set_flag(inode, PREALLOC); if (!(mode & FALLOC_FL_KEEP_SIZE) && cur_offset > i_size_read(inode)) btrfs_i_size_write(inode, cur_offset); @@ -5194,7 +5182,7 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { - if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) + if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, btrfs_check_acl); } diff --git a/trunk/fs/btrfs/ioctl.c b/trunk/fs/btrfs/ioctl.c index eff18f5b5362..2624b53ea783 100644 --- a/trunk/fs/btrfs/ioctl.c +++ b/trunk/fs/btrfs/ioctl.c @@ -50,177 +50,7 @@ #include "volumes.h" #include "locking.h" -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & ~FS_DIRSYNC_FL; - else - return flags & (FS_NODUMP_FL | FS_NOATIME_FL); -} - -/* - * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. 
- */ -static unsigned int btrfs_flags_to_ioctl(unsigned int flags) -{ - unsigned int iflags = 0; - - if (flags & BTRFS_INODE_SYNC) - iflags |= FS_SYNC_FL; - if (flags & BTRFS_INODE_IMMUTABLE) - iflags |= FS_IMMUTABLE_FL; - if (flags & BTRFS_INODE_APPEND) - iflags |= FS_APPEND_FL; - if (flags & BTRFS_INODE_NODUMP) - iflags |= FS_NODUMP_FL; - if (flags & BTRFS_INODE_NOATIME) - iflags |= FS_NOATIME_FL; - if (flags & BTRFS_INODE_DIRSYNC) - iflags |= FS_DIRSYNC_FL; - - return iflags; -} - -/* - * Update inode->i_flags based on the btrfs internal flags. - */ -void btrfs_update_iflags(struct inode *inode) -{ - struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - - if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; - if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; - if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; - if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; -} - -/* - * Inherit flags from the parent inode. - * - * Unlike extN we don't have any flags we don't want to inherit currently. - */ -void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) -{ - unsigned int flags; - - if (!dir) - return; - - flags = BTRFS_I(dir)->flags; - - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); - - BTRFS_I(inode)->flags = flags; - btrfs_update_iflags(inode); -} - -static int btrfs_ioctl_getflags(struct file *file, void __user *arg) -{ - struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); - unsigned int flags = btrfs_flags_to_ioctl(ip->flags); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - -static int btrfs_ioctl_setflags(struct file *file, void __user *arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct btrfs_inode *ip = BTRFS_I(inode); - struct btrfs_root *root = ip->root; - struct btrfs_trans_handle *trans; - unsigned int flags, oldflags; - int ret; - - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; - - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL)) - return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) - return -EACCES; - - mutex_lock(&inode->i_mutex); - - flags = btrfs_mask_flags(inode->i_mode, flags); - oldflags = btrfs_flags_to_ioctl(ip->flags); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto out_unlock; - } - } - - ret = mnt_want_write(file->f_path.mnt); - if (ret) - goto out_unlock; - - if (flags & FS_SYNC_FL) - ip->flags |= BTRFS_INODE_SYNC; - else - ip->flags &= ~BTRFS_INODE_SYNC; - if (flags & FS_IMMUTABLE_FL) - ip->flags |= BTRFS_INODE_IMMUTABLE; - else - ip->flags &= ~BTRFS_INODE_IMMUTABLE; - if (flags & FS_APPEND_FL) - ip->flags |= BTRFS_INODE_APPEND; - else - ip->flags &= ~BTRFS_INODE_APPEND; - if (flags & FS_NODUMP_FL) - ip->flags |= BTRFS_INODE_NODUMP; - else - ip->flags &= ~BTRFS_INODE_NODUMP; - if (flags & FS_NOATIME_FL) - ip->flags |= BTRFS_INODE_NOATIME; - else - ip->flags &= ~BTRFS_INODE_NOATIME; - if (flags & FS_DIRSYNC_FL) - ip->flags |= BTRFS_INODE_DIRSYNC; - else - ip->flags &= ~BTRFS_INODE_DIRSYNC; - - - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - - 
btrfs_update_iflags(inode); - inode->i_ctime = CURRENT_TIME; - btrfs_end_transaction(trans, root); - - mnt_drop_write(file->f_path.mnt); - out_unlock: - mutex_unlock(&inode->i_mutex); - return 0; -} - -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - - return put_user(inode->i_generation, arg); -} static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, @@ -252,25 +82,22 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); write_extent_buffer(leaf, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; @@ -298,7 +125,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_dirid(&root_item, new_dirid); key.objectid = objectid; - key.offset = 0; + key.offset = 1; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -1084,10 +911,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, 0, - root->root_key.objectid, - inode->i_ino, - new_key.offset - datao); + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -1247,12 +1074,6 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { - case FS_IOC_GETFLAGS: - return btrfs_ioctl_getflags(file, argp); - case FS_IOC_SETFLAGS: - return btrfs_ioctl_setflags(file, argp); - case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: diff --git a/trunk/fs/btrfs/print-tree.c b/trunk/fs/btrfs/print-tree.c index 6d6523da0a30..5f8f218c1005 100644 --- a/trunk/fs/btrfs/print-tree.c +++ b/trunk/fs/btrfs/print-tree.c @@ -45,132 +45,22 @@ static void print_dev_item(struct extent_buffer *eb, (unsigned long long)btrfs_device_total_bytes(eb, dev_item), (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); } -static void print_extent_data_ref(struct extent_buffer *eb, - struct btrfs_extent_data_ref *ref) -{ - printk(KERN_INFO "\t\textent data backref root %llu " - "objectid %llu offset %llu count %u\n", - (unsigned long long)btrfs_extent_data_ref_root(eb, ref), - (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), - (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), - btrfs_extent_data_ref_count(eb, ref)); -} - -static void print_extent_item(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - 
struct btrfs_extent_data_ref *dref; - struct btrfs_shared_data_ref *sref; - struct btrfs_disk_key key; - unsigned long end; - unsigned long ptr; - int type; - u32 item_size = btrfs_item_size_nr(eb, slot); - u64 flags; - u64 offset; - - if (item_size < sizeof(*ei)) { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); - printk(KERN_INFO "\t\textent refs %u\n", - btrfs_extent_refs_v0(eb, ei0)); - return; -#else - BUG(); -#endif - } - - ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); - flags = btrfs_extent_flags(eb, ei); - - printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", - (unsigned long long)btrfs_extent_refs(eb, ei), - (unsigned long long)btrfs_extent_generation(eb, ei), - (unsigned long long)flags); - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - struct btrfs_tree_block_info *info; - info = (struct btrfs_tree_block_info *)(ei + 1); - btrfs_tree_block_key(eb, info, &key); - printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " - "level %d\n", - (unsigned long long)btrfs_disk_key_objectid(&key), - key.type, - (unsigned long long)btrfs_disk_key_offset(&key), - btrfs_tree_block_level(eb, info)); - iref = (struct btrfs_extent_inline_ref *)(info + 1); - } else { - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - } - - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(eb, iref); - offset = btrfs_extent_inline_ref_offset(eb, iref); - switch (type) { - case BTRFS_TREE_BLOCK_REF_KEY: - printk(KERN_INFO "\t\ttree block backref " - "root %llu\n", (unsigned long long)offset); - break; - case BTRFS_SHARED_BLOCK_REF_KEY: - printk(KERN_INFO "\t\tshared block backref " - "parent %llu\n", (unsigned long long)offset); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - print_extent_data_ref(eb, dref); - break; - case BTRFS_SHARED_DATA_REF_KEY: - sref = (struct btrfs_shared_data_ref *)(iref + 1); - printk(KERN_INFO "\t\tshared data backref " - "parent %llu count %u\n", - (unsigned long long)offset, - btrfs_shared_data_ref_count(eb, sref)); - break; - default: - BUG(); - } - ptr += btrfs_extent_inline_ref_size(type); - } - WARN_ON(ptr > end); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static void print_extent_ref_v0(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_ref_v0 *ref0; - - ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); - printk("\t\textent back ref root %llu gen %llu " - "owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root_v0(eb, ref0), - (unsigned long long)btrfs_ref_generation_v0(eb, ref0), - (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), - (unsigned long)btrfs_ref_count_v0(eb, ref0)); -} -#endif - void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; u32 nr = btrfs_header_nritems(l); struct btrfs_item *item; + struct btrfs_extent_item *ei; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; - struct btrfs_extent_data_ref *dref; - struct btrfs_shared_data_ref *sref; - struct btrfs_dev_extent *dev_extent; struct btrfs_key key; struct btrfs_key found_key; + struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; + u32 type; printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 
(unsigned long long)btrfs_header_bytenr(l), nr, @@ -210,25 +100,20 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); - break; - case BTRFS_TREE_BLOCK_REF_KEY: - printk(KERN_INFO "\t\ttree block backref\n"); - break; - case BTRFS_SHARED_BLOCK_REF_KEY: - printk(KERN_INFO "\t\tshared block backref\n"); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - dref = btrfs_item_ptr(l, i, - struct btrfs_extent_data_ref); - print_extent_data_ref(l, dref); - break; - case BTRFS_SHARED_DATA_REF_KEY: - sref = btrfs_item_ptr(l, i, - struct btrfs_shared_data_ref); - printk(KERN_INFO "\t\tshared data backref count %u\n", - btrfs_shared_data_ref_count(l, sref)); + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printk(KERN_INFO "\t\textent data refs %u\n", + btrfs_extent_refs(l, ei)); + break; + case BTRFS_EXTENT_REF_KEY: + ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); + printk(KERN_INFO "\t\textent back ref root %llu " + "gen %llu owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root(l, ref), + (unsigned long long)btrfs_ref_generation(l, ref), + (unsigned long long)btrfs_ref_objectid(l, ref), + (unsigned long)btrfs_ref_num_refs(l, ref)); break; + case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); @@ -254,12 +139,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; - case BTRFS_EXTENT_REF_V0_KEY: -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - print_extent_ref_v0(l, i); -#else - BUG(); -#endif case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/trunk/fs/btrfs/relocation.c b/trunk/fs/btrfs/relocation.c deleted file mode 100644 index b23dc209ae10..000000000000 --- a/trunk/fs/btrfs/relocation.c +++ /dev/null @@ -1,3711 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/rbtree.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "volumes.h" -#include "locking.h" -#include "btrfs_inode.h" -#include "async-thread.h" - -/* - * backref_node, mapping_node and tree_block start with this - */ -struct tree_entry { - struct rb_node rb_node; - u64 bytenr; -}; - -/* - * present a tree block in the backref cache - */ -struct backref_node { - struct rb_node rb_node; - u64 bytenr; - /* objectid tree block owner */ - u64 owner; - /* list of upper level blocks that reference this block */ - struct list_head upper; - /* list of child blocks in the cache */ - struct list_head lower; - /* NULL if this node is not tree root */ - struct btrfs_root *root; - /* extent buffer got by COW the block */ - struct extent_buffer *eb; - /* level of tree block */ - unsigned int level:8; - /* 1 if the block is root of old snapshot */ - unsigned int old_root:1; - /* 1 if no child blocks in the cache */ - unsigned int lowest:1; - /* is the extent buffer locked */ - unsigned int locked:1; - /* has the block been processed */ - unsigned int processed:1; - /* have backrefs of this block been checked */ - unsigned int checked:1; -}; - -/* - * present a block pointer in the backref cache - */ -struct backref_edge { - struct list_head list[2]; - struct backref_node *node[2]; - u64 blockptr; -}; - -#define LOWER 0 -#define UPPER 1 - -struct backref_cache { - /* red black tree of all backref nodes in the cache */ - struct rb_root rb_root; - /* list of backref nodes with no child block in the cache */ - struct list_head pending[BTRFS_MAX_LEVEL]; - spinlock_t lock; -}; - -/* - * map address of tree root to tree - */ -struct mapping_node { - struct rb_node rb_node; - u64 bytenr; - void *data; -}; - -struct mapping_tree { - struct rb_root rb_root; - spinlock_t lock; -}; - -/* - * present a tree block to process - */ -struct tree_block { - struct rb_node rb_node; - u64 bytenr; - struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; -}; - -/* inode vector */ -#define INODEVEC_SIZE 16 - -struct inodevec { - struct list_head list; - struct inode *inode[INODEVEC_SIZE]; - int nr; -}; - -struct reloc_control { - /* block group to relocate */ - struct btrfs_block_group_cache *block_group; - /* extent tree */ - struct btrfs_root *extent_root; - /* inode for moving data */ - struct inode *data_inode; - struct btrfs_workers workers; - /* tree blocks have been processed */ - struct extent_io_tree processed_blocks; - /* map start of tree root to corresponding reloc tree */ - struct mapping_tree reloc_root_tree; - /* list of reloc trees */ - struct list_head reloc_roots; - u64 search_start; - u64 extents_found; - u64 extents_skipped; - int stage; - int create_reloc_root; - unsigned int found_file_extent:1; - unsigned int found_old_snapshot:1; -}; - -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - -/* - * merge reloc tree to corresponding fs tree in worker threads - */ -struct async_merge { - struct btrfs_work work; - struct reloc_control *rc; - struct btrfs_root *root; - struct completion *done; - atomic_t *num_pending; -}; - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root.rb_node = NULL; - spin_lock_init(&tree->lock); -} - -static void backref_cache_init(struct backref_cache *cache) -{ - int i; - cache->rb_root.rb_node = NULL; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&cache->pending[i]); - spin_lock_init(&cache->lock); -} - -static void
backref_node_init(struct backref_node *node) -{ - memset(node, 0, sizeof(*node)); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); -} - -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct tree_entry *entry; - - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} - -/* - * walk up backref nodes until reaching a node that presents a tree root - */ -static struct backref_node *walk_up_backref(struct backref_node *node, - struct backref_edge *edges[], - int *index) -{ - struct backref_edge *edge; - int idx = *index; - - while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct backref_edge, list[LOWER]); - edges[idx++] = edge; - node = edge->node[UPPER]; - } - *index = idx; - return node; -} - -/* - * walk down backref nodes to find start of next reference path - */ -static struct backref_node *walk_down_backref(struct backref_edge *edges[], - int *index) -{ - struct backref_edge *edge; - struct backref_node *lower; - int idx = *index; - - while (idx > 0) { - edge = edges[idx - 1]; - lower = edge->node[LOWER]; - if (list_is_last(&edge->list[LOWER], &lower->upper)) { - idx--; - continue; - } - edge = list_entry(edge->list[LOWER].next, - struct backref_edge, list[LOWER]); - edges[idx - 1] = edge; - *index = idx; - return edge->node[UPPER]; - } - *index = 0; - return NULL; -} - -static void drop_node_buffer(struct backref_node *node) -{ - if (node->eb) { - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -static void drop_backref_node(struct backref_cache *tree, - struct backref_node *node) -{ - BUG_ON(!node->lowest); - BUG_ON(!list_empty(&node->upper)); - - drop_node_buffer(node); - list_del(&node->lower); - - rb_erase(&node->rb_node, &tree->rb_root); - kfree(node); -} - -/* - * remove a backref node from the backref cache - */ -static void remove_backref_node(struct backref_cache *cache, - struct backref_node *node) -{ - struct backref_node *upper; - struct backref_edge *edge; - - if (!node) - return; - - BUG_ON(!node->lowest); - while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; - list_del(&edge->list[LOWER]); - list_del(&edge->list[UPPER]); - kfree(edge); - /* - * add the node to pending list if no other - * child block is cached.
- */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, - &cache->pending[upper->level]); - upper->lowest = 1; - } - } - drop_backref_node(cache, node); -} - -/* - * find reloc tree by address of tree root - */ -static struct btrfs_root *find_reloc_root(struct reloc_control *rc, - u64 bytenr) -{ - struct rb_node *rb_node; - struct mapping_node *node; - struct btrfs_root *root = NULL; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; - } - spin_unlock(&rc->reloc_root_tree.lock); - return root; -} - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) - return 1; - return 0; -} - -static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid) -{ - struct btrfs_key key; - - key.objectid = root_objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(root_objectid)) - key.offset = 0; - else - key.offset = (u64)-1; - - return btrfs_read_fs_root_no_name(fs_info, &key); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static noinline_for_stack -struct btrfs_root *find_tree_root(struct reloc_control *rc, - struct extent_buffer *leaf, - struct btrfs_extent_ref_v0 *ref0) -{ - struct btrfs_root *root; - u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); - u64 generation = btrfs_ref_generation_v0(leaf, ref0); - - BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); - - root = read_fs_root(rc->extent_root->fs_info, root_objectid); - BUG_ON(IS_ERR(root)); - - if (root->ref_cows && - generation != btrfs_root_generation(&root->root_item)) - return NULL; - - return root; -} -#endif - -static noinline_for_stack -int find_inline_backref(struct extent_buffer *leaf, int slot, - unsigned long *ptr, unsigned long *end) -{ - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - u32 item_size; - - item_size = btrfs_item_size_nr(leaf, slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - return 1; - } -#endif - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - WARN_ON(!(btrfs_extent_flags(leaf, ei) & - BTRFS_EXTENT_FLAG_TREE_BLOCK)); - - if (item_size <= sizeof(*ei) + sizeof(*bi)) { - WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); - return 1; - } - - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); - *end = (unsigned long)ei + item_size; - return 0; -} - -/* - * build backref tree for a given tree block. root of the backref tree - * corresponds to the tree block, leaves of the backref tree correspond - * to roots of b-trees that reference the tree block. - * - * the basic idea of this function is to check backrefs of a given block - * to find upper level blocks that reference the block, and then check - * backrefs of these upper level blocks recursively. the recursion stops - * when the tree root is reached or backrefs for the block are cached. - * - * NOTE: if we find backrefs for a block are cached, we know backrefs - * for all upper level blocks that directly/indirectly reference the - * block are also cached.
- */ -static struct backref_node *build_backref_tree(struct reloc_control *rc, - struct backref_cache *cache, - struct btrfs_key *node_key, - int level, u64 bytenr) -{ - struct btrfs_path *path1; - struct btrfs_path *path2; - struct extent_buffer *eb; - struct btrfs_root *root; - struct backref_node *cur; - struct backref_node *upper; - struct backref_node *lower; - struct backref_node *node = NULL; - struct backref_node *exist = NULL; - struct backref_edge *edge; - struct rb_node *rb_node; - struct btrfs_key key; - unsigned long end; - unsigned long ptr; - LIST_HEAD(list); - int ret; - int err = 0; - - path1 = btrfs_alloc_path(); - path2 = btrfs_alloc_path(); - if (!path1 || !path2) { - err = -ENOMEM; - goto out; - } - - node = kmalloc(sizeof(*node), GFP_NOFS); - if (!node) { - err = -ENOMEM; - goto out; - } - - backref_node_init(node); - node->bytenr = bytenr; - node->owner = 0; - node->level = level; - node->lowest = 1; - cur = node; -again: - end = 0; - ptr = 0; - key.objectid = cur->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; - - path1->search_commit_root = 1; - path1->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, - 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(!ret || !path1->slots[0]); - - path1->slots[0]--; - - WARN_ON(cur->checked); - if (!list_empty(&cur->upper)) { - /* - * the backref was added previously when processing - * backref of type BTRFS_TREE_BLOCK_REF_KEY - */ - BUG_ON(!list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct backref_edge, - list[LOWER]); - BUG_ON(!list_empty(&edge->list[UPPER])); - exist = edge->node[UPPER]; - /* - * add the upper level block to pending list if we need - * to check its backrefs - */ - if (!exist->checked) - list_add_tail(&edge->list[UPPER], &list); - } else { - exist = NULL; - } - - while (1) { - cond_resched(); - eb = path1->nodes[0]; - - if (ptr >= end) { - if (path1->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - eb = path1->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); - if (key.objectid != cur->bytenr) { - WARN_ON(exist); - break; - } - - if (key.type == BTRFS_EXTENT_ITEM_KEY) { - ret = find_inline_backref(eb, path1->slots[0], - &ptr, &end); - if (ret) - goto next; - } - } - - if (ptr < end) { - /* update key for inline back ref */ - struct btrfs_extent_inline_ref *iref; - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && - key.type != BTRFS_SHARED_BLOCK_REF_KEY); - } - - if (exist && - ((key.type == BTRFS_TREE_BLOCK_REF_KEY && - exist->owner == key.offset) || - (key.type == BTRFS_SHARED_BLOCK_REF_KEY && - exist->bytenr == key.offset))) { - exist = NULL; - goto next; - } - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.objectid == key.offset && - key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(eb, path1->slots[0], - struct btrfs_extent_ref_v0); - root = find_tree_root(rc, eb, ref0); - if (root) - cur->root = root; - else - cur->old_root = 1; - break; - } -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { -#endif - if (key.objectid == key.offset) { - /* - * only root
blocks of reloc trees use - * backref of this type. - */ - root = find_reloc_root(rc, cur->bytenr); - BUG_ON(!root); - cur->root = root; - break; - } - - edge = kzalloc(sizeof(*edge), GFP_NOFS); - if (!edge) { - err = -ENOMEM; - goto out; - } - rb_node = tree_search(&cache->rb_root, key.offset); - if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); - if (!upper) { - kfree(edge); - err = -ENOMEM; - goto out; - } - backref_node_init(upper); - upper->bytenr = key.offset; - upper->owner = 0; - upper->level = cur->level + 1; - /* - * backrefs for the upper level block aren't - * cached, add the block to pending list - */ - list_add_tail(&edge->list[UPPER], &list); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - INIT_LIST_HEAD(&edge->list[UPPER]); - } - list_add(&edge->list[LOWER], &cur->upper); - edge->node[UPPER] = upper; - edge->node[LOWER] = cur; - - goto next; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - goto next; - } - - /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ - root = read_fs_root(rc->extent_root->fs_info, key.offset); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - if (btrfs_root_level(&root->root_item) == cur->level) { - /* tree root */ - BUG_ON(btrfs_root_bytenr(&root->root_item) != - cur->bytenr); - cur->root = root; - break; - } - - level = cur->level + 1; - - /* - * searching the tree to find upper level blocks - * that reference the block. - */ - path2->search_commit_root = 1; - path2->skip_locking = 1; - path2->lowest_level = level; - ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); - path2->lowest_level = 0; - if (ret < 0) { - err = ret; - goto out; - } - - eb = path2->nodes[level]; - WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != - cur->bytenr); - - lower = cur; - for (; level < BTRFS_MAX_LEVEL; level++) { - if (!path2->nodes[level]) { - BUG_ON(btrfs_root_bytenr(&root->root_item) != - lower->bytenr); - lower->root = root; - break; - } - - edge = kzalloc(sizeof(*edge), GFP_NOFS); - if (!edge) { - err = -ENOMEM; - goto out; - } - - eb = path2->nodes[level]; - rb_node = tree_search(&cache->rb_root, eb->start); - if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); - if (!upper) { - kfree(edge); - err = -ENOMEM; - goto out; - } - backref_node_init(upper); - upper->bytenr = eb->start; - upper->owner = btrfs_header_owner(eb); - upper->level = lower->level + 1; - - /* - * if we know the block isn't shared - * we can avoid checking its backrefs. - */ - if (btrfs_block_can_be_shared(root, eb)) - upper->checked = 0; - else - upper->checked = 1; - - /* - * add the block to pending list if we - * need to check its backrefs. only the block - * at 'cur->level + 1' is added to the - * tail of pending list. this guarantees - * we check backrefs from lower level - * blocks to upper level blocks.
- */ - if (!upper->checked && - level == cur->level + 1) { - list_add_tail(&edge->list[UPPER], - &list); - } else - INIT_LIST_HEAD(&edge->list[UPPER]); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - BUG_ON(!upper->checked); - INIT_LIST_HEAD(&edge->list[UPPER]); - } - list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[UPPER] = upper; - edge->node[LOWER] = lower; - - if (rb_node) - break; - lower = upper; - upper = NULL; - } - btrfs_release_path(root, path2); -next: - if (ptr < end) { - ptr += btrfs_extent_inline_ref_size(key.type); - if (ptr >= end) { - WARN_ON(ptr > end); - ptr = 0; - end = 0; - } - } - if (ptr >= end) - path1->slots[0]++; - } - btrfs_release_path(rc->extent_root, path1); - - cur->checked = 1; - WARN_ON(exist); - - /* the pending list isn't empty, take the first block to process */ - if (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - cur = edge->node[UPPER]; - goto again; - } - - /* - * everything goes well, connect backref nodes and insert backref nodes - * into the cache. - */ - BUG_ON(!node->checked); - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); - - list_for_each_entry(edge, &node->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - - while (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - upper = edge->node[UPPER]; - - if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - - list_add_tail(&edge->list[UPPER], &upper->lower); - continue; - } - - BUG_ON(!upper->checked); - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - BUG_ON(rb_node); - - list_add_tail(&edge->list[UPPER], &upper->lower); - - list_for_each_entry(edge, &upper->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - } -out: - btrfs_free_path(path1); - btrfs_free_path(path2); - if (err) { - INIT_LIST_HEAD(&list); - upper = node; - while (upper) { - if (RB_EMPTY_NODE(&upper->rb_node)) { - list_splice_tail(&upper->upper, &list); - kfree(upper); - } - - if (list_empty(&list)) - break; - - edge = list_entry(list.next, struct backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; - kfree(edge); - } - return ERR_PTR(err); - } - return node; -} - -/* - * helper to add 'address of tree root -> reloc tree' mapping - */ -static int __add_reloc_root(struct btrfs_root *root) -{ - struct rb_node *rb_node; - struct mapping_node *node; - struct reloc_control *rc = root->fs_info->reloc_ctl; - - node = kmalloc(sizeof(*node), GFP_NOFS); - BUG_ON(!node); - - node->bytenr = root->node->start; - node->data = root; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); - - list_add_tail(&root->root_list, &rc->reloc_roots); - return 0; -} - -/* - * helper to update/delete the 'address of tree root -> reloc tree' - * mapping - */ -static int __update_reloc_root(struct btrfs_root *root, int del) -{ - struct rb_node *rb_node; - struct mapping_node *node = NULL; - struct reloc_control *rc = root->fs_info->reloc_ctl; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - rb_erase(&node->rb_node, 
&rc->reloc_root_tree.rb_root); - } - spin_unlock(&rc->reloc_root_tree.lock); - - BUG_ON((struct btrfs_root *)node->data != root); - - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); - } else { - list_del_init(&root->root_list); - kfree(node); - } - return 0; -} - -/* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. - */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb; - struct btrfs_root_item *root_item; - struct btrfs_key root_key; - int ret; - - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!root->fs_info->reloc_ctl || - !root->fs_info->reloc_ctl->create_reloc_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); - - root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = root->root_key.objectid; - - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); - - btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); - memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_refs(root_item, 1); - btrfs_set_root_bytenr(root_item, eb->start); - btrfs_set_root_level(root_item, btrfs_header_level(eb)); - btrfs_set_root_generation(root_item, trans->transid); - memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - - ret = btrfs_insert_root(trans, root->fs_info->tree_root, - &root_key, root_item); - BUG_ON(ret); - kfree(root_item); - - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); - BUG_ON(IS_ERR(reloc_root)); - reloc_root->last_trans = trans->transid; - - __add_reloc_root(reloc_root); - root->reloc_root = reloc_root; - return 0; -} - -/* - * update root item of reloc tree - */ -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct btrfs_root_item *root_item; - int del = 0; - int ret; - - if (!root->reloc_root) - return 0; - - reloc_root = root->reloc_root; - root_item = &reloc_root->root_item; - - if (btrfs_root_refs(root_item) == 0) { - root->reloc_root = NULL; - del = 1; - } - - __update_reloc_root(reloc_root, del); - - if (reloc_root->commit_root != reloc_root->node) { - btrfs_set_root_node(root_item, reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - reloc_root->commit_root = btrfs_root_node(reloc_root); - } - - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &reloc_root->root_key, root_item); - BUG_ON(ret); - return 0; -} - -/* - * helper to find first cached inode with inode number >= objectid - * in a subvolume - */ -static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < 
entry->vfs_inode.i_ino) - node = node->rb_left; - else if (objectid > entry->vfs_inode.i_ino) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= entry->vfs_inode.i_ino) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - return inode; - } - - objectid = entry->vfs_inode.i_ino + 1; - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); - return NULL; -} - -static int in_block_group(u64 bytenr, - struct btrfs_block_group_cache *block_group) -{ - if (bytenr >= block_group->key.objectid && - bytenr < block_group->key.objectid + block_group->key.offset) - return 1; - return 0; -} - -/* - * get new location of data - */ -static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, - u64 bytenr, u64 num_bytes) -{ - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - bytenr -= BTRFS_I(reloc_inode)->index_cnt; - ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, - bytenr, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - BUG_ON(btrfs_file_extent_offset(leaf, fi) || - btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)); - - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = 1; - goto out; - } - - if (new_bytenr) - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -/* - * update file extent items in the tree leaf to point to - * the new locations. 
- */ -static int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct list_head *inode_list) -{ - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; - struct inodevec *ivec = NULL; - u64 parent; - u64 bytenr; - u64 new_bytenr; - u64 num_bytes; - u64 end; - u32 nritems; - u32 i; - int ret; - int first = 1; - int dirty = 0; - - if (rc->stage != UPDATE_DATA_PTRS) - return 0; - - /* reloc trees always use full backref */ - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent = leaf->start; - else - parent = 0; - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (bytenr == 0) - continue; - if (!in_block_group(bytenr, rc->block_group)) - continue; - - /* - * if we are modifying a block in the fs tree, wait for readpage - * to complete and drop the extent cache - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (!ivec || ivec->nr == INODEVEC_SIZE) { - ivec = kmalloc(sizeof(*ivec), GFP_NOFS); - BUG_ON(!ivec); - ivec->nr = 0; - list_add_tail(&ivec->list, inode_list); - } - if (first) { - inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; - first = 0; - } else if (inode && inode->i_ino < key.objectid) { - inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; - } - if (inode && inode->i_ino == key.objectid) { - end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - WARN_ON(!IS_ALIGNED(key.offset, - root->sectorsize)); - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); - end--; - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - GFP_NOFS); - if (!ret) - continue; - - btrfs_drop_extent_cache(inode, key.offset, end, - 1); - unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, GFP_NOFS); - } - } - - ret = get_new_location(rc->data_inode, &new_bytenr, - bytenr, num_bytes); - if (ret > 0) - continue; - BUG_ON(ret < 0); - - btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, root, new_bytenr, - num_bytes, parent, - btrfs_header_owner(leaf), - key.objectid, key.offset); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - parent, btrfs_header_owner(leaf), - key.objectid, key.offset); - BUG_ON(ret); - } - if (dirty) - btrfs_mark_buffer_dirty(leaf); - return 0; -} - -static noinline_for_stack -int memcmp_node_keys(struct extent_buffer *eb, int slot, - struct btrfs_path *path, int level) -{ - struct btrfs_disk_key key1; - struct btrfs_disk_key key2; - btrfs_node_key(eb, &key1, slot); - btrfs_node_key(path->nodes[level], &key2, path->slots[level]); - return memcmp(&key1, &key2, sizeof(key1)); -} - -/* - * try to replace tree blocks in fs tree with the new blocks - * in reloc tree. tree blocks that haven't been modified since the - * reloc tree was created can be replaced. - * - * if a block was replaced, level of the block + 1 is returned. - * if no block got replaced, 0 is returned.
if there are other - * errors, a negative error number is returned. - */ -static int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - struct extent_buffer **leaf, - int lowest_level, int max_level) -{ - struct extent_buffer *eb; - struct extent_buffer *parent; - struct btrfs_key key; - u64 old_bytenr; - u64 new_bytenr; - u64 old_ptr_gen; - u64 new_ptr_gen; - u64 last_snapshot; - u32 blocksize; - int level; - int ret; - int slot; - - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(lowest_level > 1 && leaf); - - last_snapshot = btrfs_root_last_snapshot(&src->root_item); - - slot = path->slots[lowest_level]; - btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); - - eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking(eb); - level = btrfs_header_level(eb); - - if (level < lowest_level) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - return 0; - } - - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (next_key) { - next_key->objectid = (u64)-1; - next_key->type = (u8)-1; - next_key->offset = (u64)-1; - } - - parent = eb; - while (1) { - level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); - - ret = btrfs_bin_search(parent, &key, level, &slot); - if (ret && slot > 0) - slot--; - - if (next_key && slot + 1 < btrfs_header_nritems(parent)) - btrfs_node_key_to_cpu(parent, next_key, slot + 1); - - old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = btrfs_level_size(dest, level - 1); - old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - - if (level <= max_level) { - eb = path->nodes[level]; - new_bytenr = btrfs_node_blockptr(eb, - path->slots[level]); - new_ptr_gen = btrfs_node_ptr_generation(eb, - path->slots[level]); - } else { - new_bytenr = 0; - new_ptr_gen = 0; - } - - if (new_bytenr > 0 && new_bytenr == old_bytenr) { - WARN_ON(1); - ret = level; - break; - } - - if (new_bytenr == 0 || old_ptr_gen > last_snapshot || - memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level && !leaf) { - ret = 0; - break; - } - - eb = read_tree_block(dest, old_bytenr, blocksize, - old_ptr_gen); - btrfs_tree_lock(eb); - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (level <= lowest_level) { - *leaf = eb; - ret = 0; - break; - } - - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - - parent = eb; - continue; - } - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - btrfs_release_path(src, path); - - path->lowest_level = level; - ret = btrfs_search_slot(trans, src, &key, path, 0, 1); - path->lowest_level = 0; - BUG_ON(ret); - - /* - * swap blocks in fs tree and reloc tree. 
- */ - btrfs_set_node_blockptr(parent, slot, new_bytenr); - btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(parent); - - btrfs_set_node_blockptr(path->nodes[level], - path->slots[level], old_bytenr); - btrfs_set_node_ptr_generation(path->nodes[level], - path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(path->nodes[level]); - - ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); - BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); - BUG_ON(ret); - - btrfs_unlock_up_safe(path, 0); - - ret = level; - break; - } - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - return ret; -} - -/* - * helper to find next relocated block in reloc tree - */ -static noinline_for_stack -int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb; - int i; - u64 last_snapshot; - u32 nritems; - - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - - for (i = 0; i < *level; i++) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - - for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { - eb = path->nodes[i]; - nritems = btrfs_header_nritems(eb); - while (path->slots[i] + 1 < nritems) { - path->slots[i]++; - if (btrfs_node_ptr_generation(eb, path->slots[i]) <= - last_snapshot) - continue; - - *level = i; - return 0; - } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - return 1; -} - -/* - * walk down reloc tree to find relocated block of lowest level - */ -static noinline_for_stack -int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb = NULL; - int i; - u64 bytenr; - u64 ptr_gen = 0; - u64 last_snapshot; - u32 blocksize; - u32 nritems; - - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - - for (i = *level; i > 0; i--) { - eb = path->nodes[i]; - nritems = btrfs_header_nritems(eb); - while (path->slots[i] < nritems) { - ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); - if (ptr_gen > last_snapshot) - break; - path->slots[i]++; - } - if (path->slots[i] >= nritems) { - if (i == *level) - break; - *level = i + 1; - return 0; - } - if (i == 1) { - *level = i; - return 0; - } - - bytenr = btrfs_node_blockptr(eb, path->slots[i]); - blocksize = btrfs_level_size(root, i - 1); - eb = read_tree_block(root, bytenr, blocksize, ptr_gen); - BUG_ON(btrfs_header_level(eb) != i - 1); - path->nodes[i - 1] = eb; - path->slots[i - 1] = 0; - } - return 1; -} - -/* - * invalidate extent cache for file extents whose key in range of - * [min_key, max_key) - */ -static int invalidate_extent_cache(struct btrfs_root *root, - struct btrfs_key *min_key, - struct btrfs_key *max_key) -{ - struct inode *inode = NULL; - u64 objectid; - u64 start, end; - - objectid = min_key->objectid; - while (1) { - cond_resched(); - iput(inode); - - if (objectid > max_key->objectid) - break; - - inode = find_next_inode(root, objectid); - if (!inode) - break; - - if (inode->i_ino > max_key->objectid) { - iput(inode); - break; - } - - objectid = inode->i_ino + 1; - if (!S_ISREG(inode->i_mode)) - 
continue; - - if (unlikely(min_key->objectid == inode->i_ino)) { - if (min_key->type > BTRFS_EXTENT_DATA_KEY) - continue; - if (min_key->type < BTRFS_EXTENT_DATA_KEY) - start = 0; - else { - start = min_key->offset; - WARN_ON(!IS_ALIGNED(start, root->sectorsize)); - } - } else { - start = 0; - } - - if (unlikely(max_key->objectid == inode->i_ino)) { - if (max_key->type < BTRFS_EXTENT_DATA_KEY) - continue; - if (max_key->type > BTRFS_EXTENT_DATA_KEY) { - end = (u64)-1; - } else { - if (max_key->offset == 0) - continue; - end = max_key->offset; - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); - end--; - } - } else { - end = (u64)-1; - } - - /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - } - return 0; -} - -static int find_next_key(struct btrfs_path *path, int level, - struct btrfs_key *key) - -{ - while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - break; - if (path->slots[level] + 1 < - btrfs_header_nritems(path->nodes[level])) { - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - level++; - } - return 1; -} - -/* - * merge the relocated tree blocks in reloc tree with corresponding - * fs tree. - */ -static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, - struct btrfs_root *root) -{ - LIST_HEAD(inode_list); - struct btrfs_key key; - struct btrfs_key next_key; - struct btrfs_trans_handle *trans; - struct btrfs_root *reloc_root; - struct btrfs_root_item *root_item; - struct btrfs_path *path; - struct extent_buffer *leaf = NULL; - unsigned long nr; - int level; - int max_level; - int replaced = 0; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - reloc_root = root->reloc_root; - root_item = &reloc_root->root_item; - - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_root_level(root_item); - extent_buffer_get(reloc_root->node); - path->nodes[level] = reloc_root->node; - path->slots[level] = 0; - } else { - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - - level = root_item->drop_level; - BUG_ON(level == 0); - path->lowest_level = level; - ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); - if (ret < 0) { - btrfs_free_path(path); - return ret; - } - - btrfs_node_key_to_cpu(path->nodes[level], &next_key, - path->slots[level]); - WARN_ON(memcmp(&key, &next_key, sizeof(key))); - - btrfs_unlock_up_safe(path, 0); - } - - if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { - trans = btrfs_start_transaction(root, 1); - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(reloc_root, path); - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - btrfs_unlock_up_safe(path, 1); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - if (ret < 0) - err = ret; - goto out; - } - - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - leaf = NULL; - replaced = 0; - trans = btrfs_start_transaction(root, 1); - max_level = level; - - ret = walk_down_reloc_tree(reloc_root, path, &level); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - - if (!find_next_key(path, level, &key) && - btrfs_comp_cpu_keys(&next_key, &key) >= 0) { - ret = 0; - } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_path(trans, 
root, reloc_root, - path, &next_key, &leaf, - level, max_level); - } else { - ret = replace_path(trans, root, reloc_root, - path, &next_key, NULL, - level, max_level); - } - if (ret < 0) { - err = ret; - goto out; - } - - if (ret > 0) { - level = ret; - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - replaced = 1; - } else if (leaf) { - /* - * no block got replaced, try replacing file extents - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - BUG_ON(ret < 0); - } - - ret = walk_up_reloc_tree(reloc_root, path, &level); - if (ret > 0) - break; - - BUG_ON(level == 0); - /* - * save the merging progress in the drop_progress. - * this is OK since root refs == 1 in this case. - */ - btrfs_node_key(path->nodes[level], &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); - - if (replaced && rc->stage == UPDATE_DATA_PTRS) - invalidate_extent_cache(root, &key, &next_key); - } - - /* - * handle the case only one block in the fs tree need to be - * relocated and the block is tree root. - */ - leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - if (ret < 0) - err = ret; -out: - btrfs_free_path(path); - - if (err == 0) { - memset(&root_item->drop_progress, 0, - sizeof(root_item->drop_progress)); - root_item->drop_level = 0; - btrfs_set_root_refs(root_item, 0); - } - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); - - /* - * put inodes while we aren't holding the tree locks - */ - while (!list_empty(&inode_list)) { - struct inodevec *ivec; - ivec = list_entry(inode_list.next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } - - if (replaced && rc->stage == UPDATE_DATA_PTRS) - invalidate_extent_cache(root, &key, &next_key); - - return err; -} - -/* - * callback for the work threads. - * this function merges reloc tree with corresponding fs tree, - * and then drops the reloc tree. 
- */ -static void merge_func(struct btrfs_work *work) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_root *reloc_root; - struct async_merge *async; - - async = container_of(work, struct async_merge, work); - reloc_root = async->root; - - if (btrfs_root_refs(&reloc_root->root_item) > 0) { - root = read_fs_root(reloc_root->fs_info, - reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - merge_reloc_root(async->rc, root); - - trans = btrfs_start_transaction(root, 1); - btrfs_update_reloc_root(trans, root); - btrfs_end_transaction(trans, root); - } - - btrfs_drop_dead_root(reloc_root); - - if (atomic_dec_and_test(async->num_pending)) - complete(async->done); - - kfree(async); -} - -static int merge_reloc_roots(struct reloc_control *rc) -{ - struct async_merge *async; - struct btrfs_root *root; - struct completion done; - atomic_t num_pending; - - init_completion(&done); - atomic_set(&num_pending, 1); - - while (!list_empty(&rc->reloc_roots)) { - root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&root->root_list); - - async = kmalloc(sizeof(*async), GFP_NOFS); - BUG_ON(!async); - async->work.func = merge_func; - async->work.flags = 0; - async->rc = rc; - async->root = root; - async->done = &done; - async->num_pending = &num_pending; - atomic_inc(&num_pending); - btrfs_queue_worker(&rc->workers, &async->work); - } - - if (!atomic_dec_and_test(&num_pending)) - wait_for_completion(&done); - - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; -} - -static void free_block_list(struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - while ((rb_node = rb_first(blocks))) { - block = rb_entry(rb_node, struct tree_block, rb_node); - rb_erase(rb_node, blocks); - kfree(block); - } -} - -static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *reloc_root) -{ - struct btrfs_root *root; - - if (reloc_root->last_trans == trans->transid) - return 0; - - root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - return btrfs_record_root_in_trans(trans, root); -} - -/* - * select one tree from the trees that reference the block. - * for blocks in reference counted trees, we prefer the reloc tree. - * if no reloc tree is found and reloc_only is true, NULL is returned. - */ -static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], - int *nr, int reloc_only) -{ - struct backref_node *next; - struct btrfs_root *root; - int index; - int loop = 0; -again: - index = 0; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; - if (!root) { - BUG_ON(!node->old_root); - goto skip; - } - - /* no other choice for a non-reference counted tree */ - if (!root->ref_cows) { - BUG_ON(reloc_only); - break; - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); - break; - } - - if (loop) { - btrfs_record_root_in_trans(trans, root); - break; - } - - if (reloc_only || next != node) { - if (!root->reloc_root) - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - /* - * if the reloc tree was created in the current - * transaction, there is no node in the backref tree - * that corresponds to the root of the reloc tree.
- */ - if (btrfs_root_last_snapshot(&root->root_item) == - trans->transid - 1) - break; - } -skip: - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - - if (!root && !loop && !reloc_only) { - loop = 1; - goto again; - } - - if (root) - *nr = index; - else - *nr = 0; - - return root; -} - -static noinline_for_stack -struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node) -{ - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int nr; - return __select_one_root(trans, node, edges, &nr, 0); -} - -static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], int *nr) -{ - return __select_one_root(trans, node, edges, nr, 1); -} - -static void grab_path_buffers(struct btrfs_path *path, - struct backref_node *node, - struct backref_edge *edges[], int nr) -{ - int i = 0; - while (1) { - drop_node_buffer(node); - node->eb = path->nodes[node->level]; - BUG_ON(!node->eb); - if (path->locks[node->level]) - node->locked = 1; - path->nodes[node->level] = NULL; - path->locks[node->level] = 0; - - if (i >= nr) - break; - - edges[i]->blockptr = node->eb->start; - node = edges[i]->node[UPPER]; - i++; - } -} - -/* - * relocate a block tree, and then update pointers in upper level - * blocks that reference the block to point to the new location. - * - * if called by link_to_upper, the block has already been relocated. - * in that case this function just updates pointers. - */ -static int do_relocation(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct btrfs_key *key, - struct btrfs_path *path, int lowest) -{ - struct backref_node *upper; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - struct btrfs_root *root; - struct extent_buffer *eb; - u32 blocksize; - u64 bytenr; - u64 generation; - int nr; - int slot; - int ret; - int err = 0; - - BUG_ON(lowest && node->eb); - - path->lowest_level = node->level + 1; - list_for_each_entry(edge, &node->upper, list[LOWER]) { - cond_resched(); - if (node->eb && node->eb->start == edge->blockptr) - continue; - - upper = edge->node[UPPER]; - root = select_reloc_root(trans, upper, edges, &nr); - if (!root) - continue; - - if (upper->eb && !upper->locked) - drop_node_buffer(upper); - - if (!upper->eb) { - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - - slot = path->slots[upper->level]; - - btrfs_unlock_up_safe(path, upper->level + 1); - grab_path_buffers(path, upper, edges, nr); - - btrfs_release_path(NULL, path); - } else { - ret = btrfs_bin_search(upper->eb, key, upper->level, - &slot); - BUG_ON(ret); - } - - bytenr = btrfs_node_blockptr(upper->eb, slot); - if (!lowest) { - if (node->eb->start == bytenr) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - continue; - } - } else { - BUG_ON(node->bytenr != bytenr); - } - - blocksize = btrfs_level_size(root, node->level); - generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(root, bytenr, blocksize, generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - - if (!node->eb) { - ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); - if (ret < 0) { - err = ret; - break; - } - btrfs_set_lock_blocking(eb); - node->eb = eb; - node->locked = 1; - } else { - btrfs_set_node_blockptr(upper->eb, slot, - node->eb->start); - 
btrfs_set_node_ptr_generation(upper->eb, slot, - trans->transid); - btrfs_mark_buffer_dirty(upper->eb); - - ret = btrfs_inc_extent_ref(trans, root, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb), - node->level, 0); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - if (!lowest) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - } - } - path->lowest_level = 0; - return err; -} - -static int link_to_upper(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct btrfs_path *path) -{ - struct btrfs_key key; - if (!node->eb || list_empty(&node->upper)) - return 0; - - btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, node, &key, path, 0); -} - -static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct backref_cache *cache, - struct btrfs_path *path) -{ - struct backref_node *node; - int level; - int ret; - int err = 0; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - while (!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct backref_node, lower); - BUG_ON(node->level != level); - - ret = link_to_upper(trans, node, path); - if (ret < 0) - err = ret; - /* - * this removes the node from the pending list and - * may add some other nodes to the level + 1 - * pending list - */ - remove_backref_node(cache, node); - } - } - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - return err; -} - -static void mark_block_processed(struct reloc_control *rc, - struct backref_node *node) -{ - u32 blocksize; - if (node->level == 0 || - in_block_group(node->bytenr, rc->block_group)) { - blocksize = btrfs_level_size(rc->extent_root, node->level); - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - GFP_NOFS); - } - node->processed = 1; -} - -/* - * mark a block and all blocks that directly/indirectly reference the block - * as processed.
- */ -static void update_processed_blocks(struct reloc_control *rc, - struct backref_node *node) -{ - struct backref_node *next = node; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int index = 0; - - while (next) { - cond_resched(); - while (1) { - if (next->processed) - break; - - mark_block_processed(rc, next); - - if (list_empty(&next->upper)) - break; - - edge = list_entry(next->upper.next, - struct backref_edge, list[LOWER]); - edges[index++] = edge; - next = edge->node[UPPER]; - } - next = walk_down_backref(edges, &index); - } -} - -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) -{ - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1)) - return 1; - return 0; -} - -/* - * check if any file extent pointers in the leaf point to - * data that requires processing - */ -static int check_file_extents(struct reloc_control *rc, - u64 bytenr, u32 blocksize, u64 ptr_gen) -{ - struct btrfs_key found_key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - u32 nritems; - int i; - int ret = 0; - - leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &found_key, i); - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) - continue; - if (in_block_group(bytenr, rc->block_group)) { - ret = 1; - break; - } - } - free_extent_buffer(leaf); - return ret; -} - -/* - * scan child blocks of a given block to find blocks that require processing - */ -static int add_child_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; - u32 nritems; - int i; - int err = 0; - - nritems = btrfs_header_nritems(node->eb); - blocksize = btrfs_level_size(rc->extent_root, node->level - 1); - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - - readahead_tree_block(rc->extent_root, - bytenr, blocksize, ptr_gen); - } - - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - if (!in_block_group(bytenr, rc->block_group) && - !check_file_extents(rc, bytenr, blocksize, ptr_gen)) - continue; - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = bytenr; - btrfs_node_key_to_cpu(node->eb, &block->key, i); - block->level = node->level - 1; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - } - if (err) - free_block_list(blocks); - return
err; -} - -/* - * find adjacent blocks that require processing - */ -static noinline_for_stack -int add_adjacent_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_cache *cache, - struct rb_root *blocks, int level, - struct backref_node **upper) -{ - struct backref_node *node; - int ret = 0; - - WARN_ON(!list_empty(&cache->pending[level])); - - if (list_empty(&cache->pending[level + 1])) - return 1; - - node = list_entry(cache->pending[level + 1].next, - struct backref_node, lower); - if (node->eb) - ret = add_child_blocks(trans, rc, node, blocks); - - *upper = node; - return ret; -} - -static int get_tree_block_key(struct reloc_control *rc, - struct tree_block *block) -{ - struct extent_buffer *eb; - - BUG_ON(block->key_ready); - eb = read_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - WARN_ON(btrfs_header_level(eb) != block->level); - if (block->level == 0) - btrfs_item_key_to_cpu(eb, &block->key, 0); - else - btrfs_node_key_to_cpu(eb, &block->key, 0); - free_extent_buffer(eb); - block->key_ready = 1; - return 0; -} - -static int reada_tree_block(struct reloc_control *rc, - struct tree_block *block) -{ - BUG_ON(block->key_ready); - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - return 0; -} - -/* - * helper function to relocate a tree block - */ -static int relocate_tree_block(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct btrfs_key *key, - struct btrfs_path *path) -{ - struct btrfs_root *root; - int ret; - - root = select_one_root(trans, node); - if (unlikely(!root)) { - rc->found_old_snapshot = 1; - update_processed_blocks(rc, node); - return 0; - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = do_relocation(trans, node, key, path, 1); - if (ret < 0) - goto out; - if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, - node->eb, NULL); - if (ret < 0) - goto out; - } - drop_node_buffer(node); - } else if (!root->ref_cows) { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(root, path); - if (ret < 0) - goto out; - } else if (root != node->root) { - WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); - } - - update_processed_blocks(rc, node); - ret = 0; -out: - drop_node_buffer(node); - return ret; -} - -/* - * relocate a list of blocks - */ -static noinline_for_stack -int relocate_tree_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct rb_root *blocks) -{ - struct backref_cache *cache; - struct backref_node *node; - struct btrfs_path *path; - struct tree_block *block; - struct rb_node *rb_node; - int level = -1; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - cache = kmalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - btrfs_free_path(path); - return -ENOMEM; - } - - backref_cache_init(cache); - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (level == -1) - level = block->level; - else - BUG_ON(level != block->level); - if (!block->key_ready) - reada_tree_block(rc, block); - rb_node = rb_next(rb_node); - } - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - get_tree_block_key(rc, block); - rb_node = rb_next(rb_node); - } - - rb_node = rb_first(blocks); - 
while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - - node = build_backref_tree(rc, cache, &block->key, - block->level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } - - ret = relocate_tree_block(trans, rc, node, &block->key, - path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - - if (level > 0) - goto out; - - free_block_list(blocks); - - /* - * now backrefs of some upper level tree blocks have been cached, - * try relocating blocks referenced by these upper level blocks. - */ - while (1) { - struct backref_node *upper = NULL; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - break; - - ret = add_adjacent_blocks(trans, rc, cache, blocks, level, - &upper); - if (ret < 0) - err = ret; - if (ret != 0) - break; - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - goto out; - BUG_ON(!block->key_ready); - node = build_backref_tree(rc, cache, &block->key, - level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } - - ret = relocate_tree_block(trans, rc, node, - &block->key, path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - free_block_list(blocks); - - if (upper) { - ret = link_to_upper(trans, upper, path); - if (ret < 0) { - err = ret; - break; - } - remove_backref_node(cache, upper); - } - } -out: - free_block_list(blocks); - - ret = finish_pending_nodes(trans, cache, path); - if (ret < 0) - err = ret; - - kfree(cache); - btrfs_free_path(path); - return err; -} - -static noinline_for_stack -int relocate_inode_pages(struct inode *inode, u64 start, u64 len) -{ - u64 page_start; - u64 page_end; - unsigned long i; - unsigned long first_index; - unsigned long last_index; - unsigned int total_read = 0; - unsigned int total_dirty = 0; - struct page *page; - struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - int ret = 0; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - - mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - - /* make sure the dirty trick played by the caller works */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); - - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - min(last_index, ra->ra_pages + i - 1)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); - if (!page) { - ret = -ENOMEM; - goto out_unlock; - } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - ret = -EIO; - goto out_unlock; - } - } - wait_on_page_writeback(page); - - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - 
unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - set_page_extent_mapped(page); - - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); - btrfs_set_extent_delalloc(inode, page_start, page_end); - - set_page_dirty(page); - total_dirty++; - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - } -out_unlock: - mutex_unlock(&inode->i_mutex); - kfree(ra); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); - return ret; -} - -static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(GFP_NOFS); - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - spin_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, end, 0); - } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - - return relocate_inode_pages(inode, start, extent_key->offset); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int get_ref_objectid_v0(struct reloc_control *rc, - struct btrfs_path *path, - struct btrfs_key *extent_key, - u64 *ref_objectid, int *path_change) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_ref_v0 *ref0; - int ret; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0]; - while (1) { - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); - leaf = path->nodes[0]; - slot = path->slots[0]; - if (path_change) - *path_change = 1; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != extent_key->objectid) - return -ENOENT; - - if (key.type != BTRFS_EXTENT_REF_V0_KEY) { - slot++; - continue; - } - ref0 = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_ref_v0); - *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - return 0; -} -#endif - -/* - * helper to add a tree block to the list. 
- * the major work is getting the generation and level of the block - */ -static int add_tree_block(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) -{ - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - struct tree_block *block; - struct rb_node *rb_node; - u32 item_size; - int level = -1; - int generation; - - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - if (item_size >= sizeof(*ei) + sizeof(*bi)) { - ei = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_item); - bi = (struct btrfs_tree_block_info *)(ei + 1); - generation = btrfs_extent_generation(eb, ei); - level = btrfs_tree_block_level(eb, bi); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int ret; - - BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, extent_key, - &ref_owner, NULL); - BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); - level = (int)ref_owner; - /* FIXME: get real generation */ - generation = 0; -#else - BUG(); -#endif - } - - btrfs_release_path(rc->extent_root, path); - - BUG_ON(level == -1); - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) - return -ENOMEM; - - block->bytenr = extent_key->objectid; - block->key.objectid = extent_key->offset; - block->key.offset = generation; - block->level = level; - block->key_ready = 0; - - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - - return 0; -} - -/* - * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY - */ -static int __add_tree_block(struct reloc_control *rc, - u64 bytenr, u32 blocksize, - struct rb_root *blocks) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - if (tree_block_processed(bytenr, blocksize, rc)) - return 0; - - if (tree_search(blocks, bytenr)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret); - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to check if the block uses full backrefs for the pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, - struct extent_buffer *eb) -{ - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct btrfs_key key; - u64 flags; - int ret; - - if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || - btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) - return 1; - - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = eb->start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = eb->len; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); - BUG_ON(ret); - - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - flags = btrfs_extent_flags(path->nodes[0], ei); - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = 1; - else - ret = 0; - btrfs_free_path(path); - return ret; -} - -/* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans the fs tree to find blocks that reference the data extent - */ -static
int find_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - struct rb_root *blocks) -{ - struct btrfs_path *path; - struct tree_block *block; - struct btrfs_root *root; - struct btrfs_file_extent_item *fi; - struct rb_node *rb_node; - struct btrfs_key key; - u64 ref_root; - u64 ref_objectid; - u64 ref_offset; - u32 ref_count; - u32 nritems; - int err = 0; - int added = 0; - int counted; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ref_root = btrfs_extent_data_ref_root(leaf, ref); - ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); - ref_offset = btrfs_extent_data_ref_offset(leaf, ref); - ref_count = btrfs_extent_data_ref_count(leaf, ref); - - root = read_fs_root(rc->extent_root->fs_info, ref_root); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - key.objectid = ref_objectid; - key.offset = ref_offset; - key.type = BTRFS_EXTENT_DATA_KEY; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - /* - * the references in tree blocks that use full backrefs - * are not counted - */ - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - - while (ref_count > 0) { - while (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - WARN_ON(1); - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - added = 0; - - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) { - WARN_ON(1); - break; - } - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - goto next; - - if (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid) - goto next; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - if (key.offset != ref_offset) - goto next; - - if (counted) - ref_count--; - if (added) - goto next; - - if (!tree_block_processed(leaf->start, leaf->len, rc)) { - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = leaf->start; - btrfs_item_key_to_cpu(leaf, &block->key, 0); - block->level = 0; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, - &block->rb_node); - BUG_ON(rb_node); - } - if (counted) - added = 1; - else - path->slots[0] = nritems; -next: - path->slots[0]++; - - } -out: - btrfs_free_path(path); - return err; -} - -/* - * helper to find all tree blocks that reference a given data extent - */ -static noinline_for_stack -int add_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) -{ - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_data_ref *dref; - struct btrfs_extent_inline_ref *iref; - unsigned long ptr; - unsigned long end; - u32 blocksize; - int ret; - int err = 0; - 
- ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, - extent_key->offset); - BUG_ON(ret < 0); - if (ret > 0) { - /* the relocated data is fragmented */ - rc->extents_skipped++; - btrfs_release_path(rc->extent_root, path); - return 0; - } - - blocksize = btrfs_level_size(rc->extent_root, 0); - - eb = path->nodes[0]; - ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - end = ptr + btrfs_item_size_nr(eb, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ptr + sizeof(struct btrfs_extent_item_v0) == end) - ptr = end; - else -#endif - ptr += sizeof(struct btrfs_extent_item); - - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - BUG(); - } - ptr += btrfs_extent_inline_ref_size(key.type); - } - WARN_ON(ptr > end); - - while (1) { - cond_resched(); - eb = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) { - err = ret; - break; - } - if (ret > 0) - break; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_key->objectid) - break; - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_DATA_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { -#endif - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - ret = 0; - } - if (ret) { - err = ret; - break; - } - path->slots[0]++; - } - btrfs_release_path(rc->extent_root, path); - if (err) - free_block_list(blocks); - return err; -} - -/* - * helper to find the next unprocessed extent - */ -static noinline_for_stack -int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - u64 start, end, last; - int ret; - - last = rc->block_group->key.objectid + rc->block_group->key.offset; - while (1) { - cond_resched(); - if (rc->search_start >= last) { - ret = 1; - break; - } - - key.objectid = rc->search_start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = 0; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, - 0, 0); - if (ret < 0) - break; -next: - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret != 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid >= last) { - ret = 1; - break; - } - - if (key.type != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= rc->search_start) { - path->slots[0]++; - goto next; - } - - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY); - - if (ret == 0 && start <= key.objectid) { - btrfs_release_path(rc->extent_root, path); - rc->search_start = end + 1; - 
} else { - rc->search_start = key.objectid + key.offset; - return 0; - } - } - btrfs_release_path(rc->extent_root, path); - return ret; -} - -static void set_reloc_control(struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); - fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->trans_mutex); -} - -static void unset_reloc_control(struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); - fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->trans_mutex); -} - -static int check_extent_flags(u64 flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - -static noinline_for_stack int relocate_block_group(struct reloc_control *rc) -{ - struct rb_root blocks = RB_ROOT; - struct btrfs_key key; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - unsigned long nr; - u64 flags; - u32 item_size; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rc->search_start = rc->block_group->key.objectid; - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - rc->create_reloc_root = 1; - set_reloc_control(rc); - - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - - while (1) { - trans = btrfs_start_transaction(rc->extent_root, 1); - - ret = find_next_extent(trans, rc, path); - if (ret < 0) - err = ret; - if (ret != 0) - break; - - rc->extents_found++; - - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int path_change = 0; - - BUG_ON(item_size != - sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, - &path_change); - if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) - flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; - else - flags = BTRFS_EXTENT_FLAG_DATA; - - if (path_change) { - btrfs_release_path(rc->extent_root, path); - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - } -#else - BUG(); -#endif - } - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = add_tree_block(rc, &key, path, &blocks); - } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { - ret = add_data_references(rc, &key, path, &blocks); - } else { - btrfs_release_path(rc->extent_root, path); - ret = 0; - } - if (ret < 0) { - err = 0; - break; - } - - if (!RB_EMPTY_ROOT(&blocks)) { - ret = relocate_tree_blocks(trans, rc, &blocks); - if (ret < 0) { - err = ret; - break; - } - } - - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); - trans = NULL; - btrfs_btree_balance_dirty(rc->extent_root, nr); - - if (rc->stage == MOVE_DATA_EXTENTS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { - 
rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, &key); - if (ret < 0) { - err = ret; - break; - } - } - } - btrfs_free_path(path); - - if (trans) { - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); - } - - rc->create_reloc_root = 0; - smp_mb(); - - if (rc->extents_found > 0) { - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - } - - merge_reloc_roots(rc); - - unset_reloc_control(rc); - - /* get rid of pinned extents */ - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - - return err; -} - -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to create inode for data relocation. - * the inode is in data relocation tree and its link count is 0 - */ -static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) -{ - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key key; - unsigned long nr; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; - - root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(root)) - return ERR_CAST(root); - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); - if (err) - goto out; - - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); - BUG_ON(err); - - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); - BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -/* - * function to relocate all extents in a block group. 
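- * relocation runs in stages: MOVE_DATA_EXTENTS copies the data extents out - * of the block group, then UPDATE_DATA_PTRS rewrites the file extent - * pointers; the caller loops over the two stages until no more extents are found.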
- */ -int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) -{ - struct btrfs_fs_info *fs_info = extent_root->fs_info; - struct reloc_control *rc; - int ret; - int err = 0; - - rc = kzalloc(sizeof(*rc), GFP_NOFS); - if (!rc) - return -ENOMEM; - - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); - INIT_LIST_HEAD(&rc->reloc_roots); - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); - - btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); - - rc->extent_root = extent_root; - btrfs_prepare_block_group_relocation(extent_root, rc->block_group); - - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - goto out; - } - - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", - (unsigned long long)rc->block_group->key.objectid, - (unsigned long long)rc->block_group->flags); - - btrfs_start_delalloc_inodes(fs_info->tree_root); - btrfs_wait_ordered_extents(fs_info->tree_root, 0); - - while (1) { - mutex_lock(&fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(fs_info->tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - - rc->extents_found = 0; - rc->extents_skipped = 0; - - ret = relocate_block_group(rc); - if (ret < 0) { - err = ret; - break; - } - - if (rc->extents_found == 0) - break; - - printk(KERN_INFO "btrfs: found %llu extents\n", - (unsigned long long)rc->extents_found); - - if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - invalidate_mapping_pages(rc->data_inode->i_mapping, - 0, -1); - rc->stage = UPDATE_DATA_PTRS; - } else if (rc->stage == UPDATE_DATA_PTRS && - rc->extents_skipped >= rc->extents_found) { - iput(rc->data_inode); - rc->data_inode = create_reloc_inode(fs_info, - rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - break; - } - rc->stage = MOVE_DATA_EXTENTS; - rc->found_file_extent = 0; - } - } - - filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - - WARN_ON(rc->block_group->pinned > 0); - WARN_ON(rc->block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); -out: - iput(rc->data_inode); - btrfs_stop_workers(&rc->workers); - btrfs_put_block_group(rc->block_group); - kfree(rc); - return err; -} - -/* - * recover relocation interrupted by system crash. - * - * this function resumes merging reloc trees with corresponding fs trees. 
- * this is important for keeping the sharing of tree blocks - */ -int btrfs_recover_relocation(struct btrfs_root *root) -{ - LIST_HEAD(reloc_roots); - struct btrfs_key key; - struct btrfs_root *fs_root; - struct btrfs_root *reloc_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct reloc_control *rc = NULL; - struct btrfs_trans_handle *trans; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = BTRFS_TREE_RELOC_OBJECTID; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - while (1) { - ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, - path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(root->fs_info->tree_root, path); - - if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || - key.type != BTRFS_ROOT_ITEM_KEY) - break; - - reloc_root = btrfs_read_fs_root_no_radix(root, &key); - if (IS_ERR(reloc_root)) { - err = PTR_ERR(reloc_root); - goto out; - } - - list_add(&reloc_root->root_list, &reloc_roots); - - if (btrfs_root_refs(&reloc_root->root_item) > 0) { - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); - if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - goto out; - } - } - - if (key.offset == 0) - break; - - key.offset--; - } - btrfs_release_path(root->fs_info->tree_root, path); - - if (list_empty(&reloc_roots)) - goto out; - - rc = kzalloc(sizeof(*rc), GFP_NOFS); - if (!rc) { - err = -ENOMEM; - goto out; - } - - mapping_tree_init(&rc->reloc_root_tree); - INIT_LIST_HEAD(&rc->reloc_roots); - btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); - rc->extent_root = root->fs_info->extent_root; - - set_reloc_control(rc); - - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - - if (btrfs_root_refs(&reloc_root->root_item) == 0) { - list_add_tail(&reloc_root->root_list, - &rc->reloc_roots); - continue; - } - - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); - BUG_ON(IS_ERR(fs_root)); - - __add_reloc_root(reloc_root); - fs_root->reloc_root = reloc_root; - } - - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - - merge_reloc_roots(rc); - - unset_reloc_control(rc); - - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); -out: - if (rc) { - btrfs_stop_workers(&rc->workers); - kfree(rc); - } - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } - btrfs_free_path(path); - - if (err == 0) { - /* cleanup orphan inode in data relocation tree */ - fs_root = read_fs_root(root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) - err = PTR_ERR(fs_root); - } - return err; -} - -/* - * helper to add ordered checksum for data relocation. - * - * cloning checksum properly handles the nodatasum extents. - * it also saves CPU time to re-calculate the checksum. 
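- * e.g. if the checksums were indexed from disk_bytenr D and the cloned - * extent now starts at ordered->start N, each per-sector bytenr is simply - * shifted by (N - D) before the sums are attached to the new ordered extent.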
- */ -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; - int ret; - u64 disk_bytenr; - LIST_HEAD(list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } - - btrfs_add_ordered_sum(inode, ordered, sums); - } - btrfs_put_ordered_extent(ordered); - return 0; -} diff --git a/trunk/fs/btrfs/root-tree.c b/trunk/fs/btrfs/root-tree.c index 0ddc6d61c55a..b48650de4472 100644 --- a/trunk/fs/btrfs/root-tree.c +++ b/trunk/fs/btrfs/root-tree.c @@ -111,15 +111,6 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, return ret; } -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node) -{ - btrfs_set_root_bytenr(item, node->start); - btrfs_set_root_level(item, btrfs_header_level(node)); - btrfs_set_root_generation(item, btrfs_header_generation(node)); - return 0; -} - /* * copy the data in 'item' into the btree */ @@ -173,7 +164,8 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root * offset lower than the latest root. They need to be queued for deletion to * finish what was happening when we crashed. 
*/ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest) { struct btrfs_root *dead_root; struct btrfs_item *item; @@ -235,7 +227,10 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) goto err; } - ret = btrfs_add_dead_root(dead_root); + if (objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_add_dead_reloc_root(dead_root); + else + ret = btrfs_add_dead_root(dead_root, latest); if (ret) goto err; goto again; diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index 9f179d4832d5..2ff7cd2db25f 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -52,6 +52,7 @@ #include "export.h" #include "compression.h" + static struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) @@ -66,8 +67,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,8 +84,6 @@ static match_table_t tokens = { {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd, "nossd"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, @@ -159,7 +158,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); + printk(KERN_INFO "btrfs: setting nodatacsum\n"); btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: @@ -175,19 +174,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); break; - case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); - break; - case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); - break; case Opt_nobarrier: printk(KERN_INFO "btrfs: turning off barriers\n"); btrfs_set_opt(info->mount_opt, NOBARRIER); @@ -336,7 +322,7 @@ static int btrfs_fill_super(struct super_block *sb, struct dentry *root_dentry; struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; - struct btrfs_key key; + struct btrfs_inode *bi; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -355,15 +341,23 @@ static int btrfs_fill_super(struct super_block *sb, } sb->s_fs_info = tree_root; disk_super = &tree_root->fs_info->super_copy; + inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, + tree_root->fs_info->fs_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->root = tree_root->fs_info->fs_root; + + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + if (!inode) { + err = 
-ENOMEM; goto fail_close; } + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } root_dentry = d_alloc_root(inode); if (!root_dentry) { @@ -394,6 +388,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; + if (sb->s_flags & MS_RDONLY) + return 0; + + sb->s_dirt = 0; if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; @@ -404,6 +402,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; return ret; } @@ -434,11 +433,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(root, COMPRESS)) seq_puts(seq, ",compress"); - if (btrfs_test_opt(root, NOSSD)) - seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) - seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) seq_puts(seq, ",notreelog"); @@ -449,6 +444,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) return 0; } +static void btrfs_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + static int btrfs_test_super(struct super_block *s, void *data) { struct btrfs_fs_devices *test_fs_devices = data; @@ -584,8 +584,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - /* recover relocation */ - ret = btrfs_recover_relocation(root); + ret = btrfs_cleanup_reloc_trees(root); WARN_ON(ret); ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -679,6 +678,7 @@ static int btrfs_unfreeze(struct super_block *sb) static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, + .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, diff --git a/trunk/fs/btrfs/transaction.c b/trunk/fs/btrfs/transaction.c index 2e177d7f4bb9..01b143605ec1 100644 --- a/trunk/fs/btrfs/transaction.c +++ b/trunk/fs/btrfs/transaction.c @@ -25,6 +25,7 @@ #include "disk-io.h" #include "transaction.h" #include "locking.h" +#include "ref-cache.h" #include "tree-log.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -93,37 +94,45 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + struct btrfs_dirty_root *dirty; + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { WARN_ON(root == root->fs_info->extent_root); - WARN_ON(root->root_item.refs == 0); - WARN_ON(root->commit_root != root->node); - - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - root->last_trans = trans->transid; - btrfs_init_reloc_root(trans, root); - } - return 0; -} + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + dirty = 
kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!root->ref_cows) - return 0; + root->commit_root = btrfs_root_node(root); - mutex_lock(&root->fs_info->trans_mutex); - if (root->last_trans == trans->transid) { - mutex_unlock(&root->fs_info->trans_mutex); - return 0; - } + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + spin_lock_init(&dirty->root->list_lock); + mutex_init(&dirty->root->objectid_mutex); + mutex_init(&dirty->root->log_mutex); + INIT_LIST_HEAD(&dirty->root->dead_list); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; - record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->trans_mutex); + spin_lock(&root->list_lock); + list_add(&dirty->root->dead_list, &root->dead_list); + spin_unlock(&root->list_lock); + + root->dirty_root = dirty; + } else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } return 0; } @@ -172,6 +181,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, ret = join_transaction(root); BUG_ON(ret); + btrfs_record_root_in_trans(root); h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; @@ -182,7 +192,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->delayed_ref_updates = 0; root->fs_info->running_transaction->use_count++; - record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); return h; } @@ -224,7 +233,6 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } -#if 0 /* * rate limit against the drop_snapshot code. This helps to slow down new * operations if the drop_snapshot code isn't able to keep up. 
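/*
 * Editor's sketch, not part of the patch: the throttle_on_drops() path
 * restored by the surrounding hunks is a counter-and-waitqueue pattern --
 * snapshot-dropping work bumps fs_info->throttles and throttled callers
 * sleep on fs_info->transaction_throttle until it drains. A minimal
 * userspace analogue of that pattern, with hypothetical names and
 * pthreads standing in for the kernel's atomic_t/wait_event API:
 */
#include <pthread.h>

static pthread_mutex_t throttle_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t throttle_drained = PTHREAD_COND_INITIALIZER;
static int throttles; /* stands in for fs_info->throttles */

static void throttle_inc(void) /* heavy work (e.g. a snapshot drop) begins */
{
	pthread_mutex_lock(&throttle_lock);
	throttles++;
	pthread_mutex_unlock(&throttle_lock);
}

static void throttle_dec(void) /* heavy work done: wake any waiters */
{
	pthread_mutex_lock(&throttle_lock);
	if (--throttles == 0)
		pthread_cond_broadcast(&throttle_drained);
	pthread_mutex_unlock(&throttle_lock);
}

static void throttle_wait(void) /* rough analogue of throttle_on_drops() */
{
	pthread_mutex_lock(&throttle_lock);
	while (throttles > 0)
		pthread_cond_wait(&throttle_drained, &throttle_lock);
	pthread_mutex_unlock(&throttle_lock);
}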
@@ -265,7 +273,6 @@ static void throttle_on_drops(struct btrfs_root *root) goto harder; } } -#endif void btrfs_throttle(struct btrfs_root *root) { @@ -273,6 +280,7 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); + throttle_on_drops(root); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -315,6 +323,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (throttle) + throttle_on_drops(root); + return 0; } @@ -451,8 +462,12 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) break; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, trans->transid); - btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); @@ -462,16 +477,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); } - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); return 0; } /* * update all the cowonly tree roots on disk */ -static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; @@ -507,54 +520,118 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) { + struct btrfs_dirty_root *dirty; + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + if (!dirty) + return -ENOMEM; + dirty->root = root; + dirty->latest_root = latest; + mutex_lock(&root->fs_info->trans_mutex); - list_add(&root->root_list, &root->fs_info->dead_roots); + list_add(&dirty->list, &latest->fs_info->dead_roots); mutex_unlock(&root->fs_info->trans_mutex); return 0; } /* - * update all the cowonly tree roots on disk + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. 
This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list */ -static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, + struct radix_tree_root *radix, + struct list_head *list) { + struct btrfs_dirty_root *dirty; struct btrfs_root *gang[8]; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *root; int i; int ret; int err = 0; + u32 refs; while (1) { - ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, - (void **)gang, 0, + ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { root = gang[i]; - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + radix_tree_tag_clear(radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = root->dirty_root; btrfs_free_log(trans, root); - btrfs_update_reloc_root(trans, root); + btrfs_free_reloc_root(trans, root); - if (root->commit_root == root->node) - continue; + if (root->commit_root == root->node) { + WARN_ON(root->node->start != + btrfs_root_bytenr(&root->root_item)); - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + free_extent_buffer(root->commit_root); + root->commit_root = NULL; + root->dirty_root = NULL; + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + spin_unlock(&root->list_lock); - btrfs_set_root_node(&root->root_item, root->node); - err = btrfs_update_root(trans, fs_info->tree_root, + kfree(dirty->root); + kfree(dirty); + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); + continue; + } + + memset(&root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root->root_item.drop_level = 0; + root->commit_root = NULL; + root->dirty_root = NULL; + root->root_key.offset = root->fs_info->generation; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, + root->root_key.offset); + + err = btrfs_insert_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); if (err) break; + + refs = btrfs_root_refs(&dirty->root->root_item); + btrfs_set_root_refs(&dirty->root->root_item, refs - 1); + err = btrfs_update_root(trans, root->fs_info->tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + + BUG_ON(err); + if (refs == 1) { + list_add(&dirty->list, list); + } else { + WARN_ON(1); + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + } } } return err; @@ -611,8 +688,12 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } @@ -624,61 +705,111 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them */ -int btrfs_drop_dead_root(struct btrfs_root 
*root) +static noinline int drop_dirty_roots(struct btrfs_root *tree_root, + struct list_head *list) { + struct btrfs_dirty_root *dirty; struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = root->fs_info->tree_root; unsigned long nr; - int ret; + u64 num_bytes; + u64 bytes_used; + u64 max_useless; + int ret = 0; + int err; - while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); - trans = btrfs_start_transaction(tree_root, 1); + while (!list_empty(list)) { + struct btrfs_root *root; - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; + dirty = list_entry(list->prev, struct btrfs_dirty_root, list); + list_del_init(&dirty->list); + + num_bytes = btrfs_root_used(&dirty->root->root_item); + root = dirty->latest_root; + atomic_inc(&root->fs_info->throttles); + + while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); + trans = btrfs_start_transaction(tree_root, 1); + + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; + } + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, dirty->root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + err = btrfs_update_root(trans, + tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + if (err) + ret = err; + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); } + BUG_ON(ret); + atomic_dec(&root->fs_info->throttles); + wake_up(&root->fs_info->transaction_throttle); - ret = btrfs_drop_snapshot(trans, root); - if (ret != -EAGAIN) - break; + num_bytes -= btrfs_root_used(&dirty->root->root_item); + bytes_used = btrfs_root_used(&root->root_item); + if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); + btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_set_root_used(&root->root_item, + bytes_used - num_bytes); + } - ret = btrfs_update_root(trans, tree_root, - &root->root_key, - &root->root_item); - if (ret) + ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); + if (ret) { + BUG(); break; + } + mutex_unlock(&root->fs_info->drop_mutex); + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + if (!list_empty(&root->dead_list)) { + struct btrfs_root *oldest; + oldest = list_entry(root->dead_list.prev, + struct btrfs_root, dead_list); + max_useless = oldest->root_key.offset - 1; + } else { + max_useless = root->root_key.offset - 1; + } + spin_unlock(&root->list_lock); nr = trans->blocks_used; ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); + ret = btrfs_remove_leaf_refs(root, max_useless, 0); + BUG_ON(ret); + + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + btrfs_btree_balance_dirty(tree_root, nr); cond_resched(); } - BUG_ON(ret); - - ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); - - free_extent_buffer(root->node); - 
free_extent_buffer(root->commit_root); - kfree(root); - - btrfs_btree_balance_dirty(tree_root, nr); return ret; } @@ -708,23 +839,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) goto fail; - record_root_in_trans(trans, root); + btrfs_record_root_in_trans(root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = 0; + key.offset = trans->transid; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); - btrfs_set_lock_blocking(old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_set_root_node(new_root_item, tmp); + btrfs_set_root_bytenr(new_root_item, tmp->start); + btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); + btrfs_set_root_generation(new_root_item, trans->transid); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); @@ -832,24 +964,6 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, return 0; } -static void update_super_roots(struct btrfs_root *root) -{ - struct btrfs_root_item *root_item; - struct btrfs_super_block *super; - - super = &root->fs_info->super_copy; - - root_item = &root->fs_info->chunk_root->root_item; - super->chunk_root = root_item->bytenr; - super->chunk_root_generation = root_item->generation; - super->chunk_root_level = root_item->level; - - root_item = &root->fs_info->tree_root->root_item; - super->root = root_item->bytenr; - super->generation = root_item->generation; - super->root_level = root_item->level; -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -857,6 +971,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct list_head dirty_fs_roots; struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; @@ -883,6 +999,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); mutex_lock(&root->fs_info->trans_mutex); + INIT_LIST_HEAD(&dirty_fs_roots); if (cur_trans->in_commit) { cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); @@ -988,36 +1105,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * with the tree-log code. 
*/ mutex_lock(&root->fs_info->tree_log_mutex); + /* + * keep tree reloc code from adding new reloc trees + */ + mutex_lock(&root->fs_info->tree_reloc_mutex); + - ret = commit_fs_roots(trans, root); + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, + &dirty_fs_roots); BUG_ON(ret); - /* commit_fs_roots gets rid of all the tree log roots, it is now + /* add_dirty_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, root->fs_info); - ret = commit_cowonly_roots(trans, root); + ret = btrfs_commit_tree_roots(trans, root); BUG_ON(ret); cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->new_trans_lock); - - btrfs_set_root_node(&root->fs_info->tree_root->root_item, - root->fs_info->tree_root->node); - free_extent_buffer(root->fs_info->tree_root->commit_root); - root->fs_info->tree_root->commit_root = - btrfs_root_node(root->fs_info->tree_root); - - btrfs_set_root_node(&root->fs_info->chunk_root->root_item, - root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - root->fs_info->chunk_root->commit_root = - btrfs_root_node(root->fs_info->chunk_root); - - update_super_roots(root); + btrfs_set_super_generation(&root->fs_info->super_copy, + cur_trans->transid); + btrfs_set_super_root(&root->fs_info->super_copy, + root->fs_info->tree_root->node->start); + btrfs_set_super_root_level(&root->fs_info->super_copy, + btrfs_header_level(root->fs_info->tree_root->node)); + + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); + btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, + btrfs_header_generation(chunk_root->node)); if (!root->fs_info->log_root_recovering) { btrfs_set_super_log_root(&root->fs_info->super_copy, 0); @@ -1031,6 +1153,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); mutex_unlock(&root->fs_info->trans_mutex); @@ -1047,6 +1170,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root, pinned_copy); kfree(pinned_copy); + btrfs_drop_dead_reloc_roots(root); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); @@ -1060,9 +1186,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + if (root->fs_info->closing) + list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); + mutex_unlock(&root->fs_info->trans_mutex); kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (root->fs_info->closing) + drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); return ret; } @@ -1071,17 +1204,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { - LIST_HEAD(list); - struct btrfs_fs_info *fs_info = root->fs_info; - - mutex_lock(&fs_info->trans_mutex); - list_splice_init(&fs_info->dead_roots, &list); - mutex_unlock(&fs_info->trans_mutex); + struct list_head dirty_roots; + INIT_LIST_HEAD(&dirty_roots); +again: + 
mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&root->fs_info->dead_roots, &dirty_roots); + mutex_unlock(&root->fs_info->trans_mutex); - while (!list_empty(&list)) { - root = list_entry(list.next, struct btrfs_root, root_list); - list_del_init(&root->root_list); - btrfs_drop_dead_root(root); + if (!list_empty(&dirty_roots)) { + drop_dirty_roots(root, &dirty_roots); + goto again; } return 0; } diff --git a/trunk/fs/btrfs/transaction.h b/trunk/fs/btrfs/transaction.h index 961c3ee5a2e1..94f5bde2b58d 100644 --- a/trunk/fs/btrfs/transaction.h +++ b/trunk/fs/btrfs/transaction.h @@ -62,6 +62,12 @@ struct btrfs_pending_snapshot { struct list_head list; }; +struct btrfs_dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; +}; + static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -94,8 +100,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_drop_dead_root(struct btrfs_root *root); +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -103,8 +108,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); #endif diff --git a/trunk/fs/btrfs/tree-log.c b/trunk/fs/btrfs/tree-log.c index c13922206d1b..db5e212e8445 100644 --- a/trunk/fs/btrfs/tree-log.c +++ b/trunk/fs/btrfs/tree-log.c @@ -430,16 +430,18 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { - struct btrfs_key key; struct inode *inode; + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); - if (IS_ERR(inode)) { - inode = NULL; - } else if (is_bad_inode(inode)) { + } + if (is_bad_inode(inode)) { iput(inode); inode = NULL; } @@ -539,7 +541,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -554,7 +555,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; - offset = key->offset - btrfs_file_extent_offset(eb, item); if (ins.objectid > 0) { u64 csum_start; @@ -569,16 +569,19 @@ static noinline int 
replay_one_extent(struct btrfs_trans_handle *trans, if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, - 0, root->root_key.objectid, - key->objectid, offset); + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid); } else { /* * insert the extent pointer in the extent * allocation tree */ - ret = btrfs_alloc_logged_file_extent(trans, - root, root->root_key.objectid, - key->objectid, offset, &ins); + ret = btrfs_alloc_logged_extent(trans, root, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid, + &ins); BUG_ON(ret); } btrfs_release_path(root, path); @@ -1703,6 +1706,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, @@ -1747,6 +1753,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); @@ -1801,6 +1811,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, + next); + BUG_ON(ret); + } + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, path->nodes[*level]->start, @@ -1868,6 +1884,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + if (orig_level == 0) { + ret = btrfs_drop_leaf_ref(trans, log, + next); + BUG_ON(ret); + } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(log, next->start, @@ -2006,7 +2027,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - btrfs_set_root_node(&log->root_item, log->node); + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_generation(&log->root_item, trans->transid); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); root->log_batch = 0; root->log_transid++; @@ -2558,7 +2581,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_keys, ins_sizes, nr); BUG_ON(ret); - for (i = 0; i < nr; i++, dst_path->slots[0]++) { + for (i = 0; i < nr; i++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); @@ -2594,31 +2617,36 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent);; + u64 ds = btrfs_file_extent_disk_bytenr(src, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(src, + extent); + u64 cs = btrfs_file_extent_offset(src, extent); + u64 cl = btrfs_file_extent_num_bytes(src, + extent);; if (btrfs_file_extent_compression(src, extent)) { 
cs = 0; cl = dl; } - - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); + /* ds == 0 is a hole */ + if (ds != 0) { + ret = btrfs_inc_extent_ref(trans, log, + ds, dl, + dst_path->nodes[0]->start, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, + ins_keys[i].objectid); + BUG_ON(ret); + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } } } + dst_path->slots[0]++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); @@ -3001,7 +3029,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(trans, wc.replay_dest); + mutex_lock(&fs_info->trans_mutex); + btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -3019,7 +3049,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); kfree(log); if (found_key.offset == 0) diff --git a/trunk/fs/btrfs/volumes.c b/trunk/fs/btrfs/volumes.c index 3ab80e9cd767..a6d35b0054ca 100644 --- a/trunk/fs/btrfs/volumes.c +++ b/trunk/fs/btrfs/volumes.c @@ -161,10 +161,8 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long num_sync_run; - unsigned long batch_run = 0; unsigned long limit; unsigned long last_waited = 0; - int force_reg = 0; bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; @@ -178,22 +176,19 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) loop: spin_lock(&device->io_lock); + num_run = 0; loop_lock: - num_run = 0; /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - if (!force_reg && device->pending_sync_bios.head) { + if (device->pending_sync_bios.head) pending_bios = &device->pending_sync_bios; - force_reg = 1; - } else { + else pending_bios = &device->pending_bios; - force_reg = 0; - } pending = pending_bios->head; tail = pending_bios->tail; @@ -233,14 +228,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) while (pending) { rmb(); - /* we want to work on both lists, but do more bios on the - * sync list than the regular list - */ - if ((num_run > 32 && - pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head) || - (num_run > 64 && pending_bios == &device->pending_sync_bios && - device->pending_bios.head)) { + if (pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head && + num_run > 16) { + cond_resched(); spin_lock(&device->io_lock); requeue_list(pending_bios, pending, tail); goto loop_lock; @@ -258,8 +249,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) BUG_ON(atomic_read(&cur->bi_cnt) == 0); submit_bio(cur->bi_rw, cur); num_run++; - batch_run++; - if (bio_sync(cur)) num_sync_run++; @@ -276,7 +265,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) * is now congested. 
Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && batch_run > 32 && + if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; @@ -377,7 +366,6 @@ static noinline int device_list_add(const char *path, memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; - mutex_init(&fs_devices->device_list_mutex); device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -404,11 +392,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } INIT_LIST_HEAD(&device->dev_alloc_list); - - mutex_lock(&fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_devices->devices); - mutex_unlock(&fs_devices->device_list_mutex); - device->fs_devices = fs_devices; fs_devices->num_devices++; } @@ -434,12 +418,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) INIT_LIST_HEAD(&fs_devices->devices); INIT_LIST_HEAD(&fs_devices->alloc_list); INIT_LIST_HEAD(&fs_devices->list); - mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); - mutex_lock(&orig->device_list_mutex); list_for_each_entry(orig_dev, &orig->devices, dev_list) { device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) @@ -461,10 +443,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -475,7 +455,6 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); again: - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -495,7 +474,6 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) kfree(device->name); kfree(device); } - mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->seed) { fs_devices = fs_devices->seed; @@ -616,9 +594,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable) { fs_devices->rw_devices++; @@ -1146,14 +1121,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device = NULL; devices = &root->fs_info->fs_devices->devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; } } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); bdev = NULL; bh = NULL; disk_super = NULL; @@ -1208,16 +1181,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto error_brelse; device->in_fs_metadata = 0; - - /* - * the device list mutex makes sure that we don't change - * the device list while someone else is writing out all - * the device supers. 
- */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_del_init(&device->dev_list); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - device->fs_devices->num_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, @@ -1311,7 +1275,6 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); - mutex_init(&seed_devices->device_list_mutex); list_splice_init(&fs_devices->devices, &seed_devices->devices); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1437,10 +1400,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. - */ list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; @@ -1495,12 +1454,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->fs_devices = root->fs_info->fs_devices; - - /* - * we don't want write_supers to jump in here with our device - * half setup - */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -1509,9 +1462,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); btrfs_set_super_total_bytes(&root->fs_info->super_copy, total_bytes + device->total_bytes); @@ -1519,7 +1469,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); btrfs_set_super_num_devices(&root->fs_info->super_copy, total_bytes + 1); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { ret = init_first_rw_device(trans, root, device); @@ -1722,6 +1671,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, int ret; int i; + printk(KERN_INFO "btrfs relocating chunk %llu\n", + (unsigned long long)chunk_offset); root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; diff --git a/trunk/fs/btrfs/volumes.h b/trunk/fs/btrfs/volumes.h index 5139a833f721..5c3ff6d02fd7 100644 --- a/trunk/fs/btrfs/volumes.h +++ b/trunk/fs/btrfs/volumes.h @@ -96,12 +96,7 @@ struct btrfs_fs_devices { u64 rw_devices; u64 total_rw_bytes; struct block_device *latest_bdev; - - /* all of the devices in the FS, protected by a mutex - * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code - */ - struct mutex device_list_mutex; + /* all of the devices in the FS */ struct list_head devices; /* devices not currently being allocated */ @@ -112,11 +107,6 @@ struct btrfs_fs_devices { int seeding; int opened; - - /* set when we find or add a device that doesn't have the - * nonrot flag set - */ - int rotating; }; struct btrfs_bio_stripe { diff --git a/trunk/fs/cachefiles/interface.c b/trunk/fs/cachefiles/interface.c index 
431accd475a7..1e962348d111 100644 --- a/trunk/fs/cachefiles/interface.c +++ b/trunk/fs/cachefiles/interface.c @@ -354,9 +354,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disc */ cachefiles_begin_secure(cache, &saved_cred); - down_read(&cache->mnt->mnt_sb->s_umount); - ret = sync_filesystem(cache->mnt->mnt_sb); - up_read(&cache->mnt->mnt_sb->s_umount); + ret = fsync_super(cache->mnt->mnt_sb); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) diff --git a/trunk/fs/char_dev.c b/trunk/fs/char_dev.c index b7c9d5187a75..38f71222a552 100644 --- a/trunk/fs/char_dev.c +++ b/trunk/fs/char_dev.c @@ -375,6 +375,7 @@ static int chrdev_open(struct inode *inode, struct file *filp) p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; + inode->i_cindex = idx; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) @@ -404,18 +405,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) return ret; } -int cdev_index(struct inode *inode) -{ - int idx; - struct kobject *kobj; - - kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); - if (!kobj) - return -1; - kobject_put(kobj); - return idx; -} - void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -568,7 +557,6 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); -EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(register_chrdev); EXPORT_SYMBOL(unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/trunk/fs/cifs/cifs_dfs_ref.c b/trunk/fs/cifs/cifs_dfs_ref.c index 3bb11be8b6a8..83d62759c7c7 100644 --- a/trunk/fs/cifs/cifs_dfs_ref.c +++ b/trunk/fs/cifs/cifs_dfs_ref.c @@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) + follow_down(&nd->path.mnt, &nd->path.dentry)) ; err = 0; default: diff --git a/trunk/fs/cifs/cifsfs.c b/trunk/fs/cifs/cifsfs.c index 0d92114195ab..0a10a59b6392 100644 --- a/trunk/fs/cifs/cifsfs.c +++ b/trunk/fs/cifs/cifsfs.c @@ -204,9 +204,6 @@ cifs_put_super(struct super_block *sb) cFYI(1, ("Empty cifs superblock info passed to unmount")); return; } - - lock_kernel(); - rc = cifs_umount(sb, cifs_sb); if (rc) cERROR(1, ("cifs_umount failed with return code %d", rc)); @@ -219,8 +216,7 @@ cifs_put_super(struct super_block *sb) unload_nls(cifs_sb->local_nls); kfree(cifs_sb); - - unlock_kernel(); + return; } static int diff --git a/trunk/fs/compat.c b/trunk/fs/compat.c index 6aefb776dfeb..bb2a9b2e8173 100644 --- a/trunk/fs/compat.c +++ b/trunk/fs/compat.c @@ -812,8 +812,10 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, } } + lock_kernel(); retval = do_mount((char*)dev_page, dir_page, (char*)type_page, flags, (void*)data_page); + unlock_kernel(); out4: free_page(data_page); diff --git a/trunk/fs/dcache.c b/trunk/fs/dcache.c index 9e5cd3c3a6ba..75659a6fd1f8 100644 --- a/trunk/fs/dcache.c +++ b/trunk/fs/dcache.c @@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root, spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); - if (d_unlinked(dentry) && + if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; @@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) spin_lock(&dcache_lock); prepend(&end, &buflen, 
"\0", 1); - if (d_unlinked(dentry) && + if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, "//deleted", 9) != 0)) goto Elong; if (buflen < 1) @@ -2097,8 +2097,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) read_unlock(¤t->fs->lock); error = -ENOENT; + /* Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (!d_unlinked(pwd.dentry)) { + if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; diff --git a/trunk/fs/ecryptfs/super.c b/trunk/fs/ecryptfs/super.c index 12d649602d3a..fa4c7e7d15d9 100644 --- a/trunk/fs/ecryptfs/super.c +++ b/trunk/fs/ecryptfs/super.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include "ecryptfs_kernel.h" @@ -121,13 +120,9 @@ static void ecryptfs_put_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); - lock_kernel(); - ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); ecryptfs_set_superblock_private(sb, NULL); - - unlock_kernel(); } /** diff --git a/trunk/fs/exofs/super.c b/trunk/fs/exofs/super.c index 8216c5b77b53..9f1985e857e2 100644 --- a/trunk/fs/exofs/super.c +++ b/trunk/fs/exofs/super.c @@ -200,21 +200,20 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -static int exofs_sync_fs(struct super_block *sb, int wait) +static void exofs_write_super(struct super_block *sb) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct osd_request *or; struct osd_obj_id obj; - int ret = -ENOMEM; + int ret; fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); if (!fscb) { EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); - return -ENOMEM; + return; } - lock_super(sb); lock_kernel(); sbi = sb->s_fs_info; fscb->s_nextid = cpu_to_le64(sbi->s_nextid); @@ -247,17 +246,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) if (or) osd_end_request(or); unlock_kernel(); - unlock_super(sb); kfree(fscb); - return ret; -} - -static void exofs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - exofs_sync_fs(sb, 1); - else - sb->s_dirt = 0; } /* @@ -269,11 +258,6 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; - lock_kernel(); - - if (sb->s_dirt) - exofs_write_super(sb); - /* make sure there are no pending commands */ for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; num_pend = atomic_read(&sbi->s_curr_pending)) { @@ -287,8 +271,6 @@ static void exofs_put_super(struct super_block *sb) osduld_put_device(sbi->s_dev); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } /* @@ -502,7 +484,6 @@ static const struct super_operations exofs_sops = { .delete_inode = exofs_delete_inode, .put_super = exofs_put_super, .write_super = exofs_write_super, - .sync_fs = exofs_sync_fs, .statfs = exofs_statfs, }; diff --git a/trunk/fs/ext2/Makefile b/trunk/fs/ext2/Makefile index f42af45cfd88..e0b2b43c1fdb 100644 --- a/trunk/fs/ext2/Makefile +++ b/trunk/fs/ext2/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT2_FS) += ext2.o -ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ +ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o diff --git a/trunk/fs/ext2/dir.c b/trunk/fs/ext2/dir.c index 003500498c22..2999d72153b7 100644 --- a/trunk/fs/ext2/dir.c +++ 
b/trunk/fs/ext2/dir.c @@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .fsync = simple_fsync, + .fsync = ext2_sync_file, }; diff --git a/trunk/fs/ext2/ext2.h b/trunk/fs/ext2/ext2.h index b2bbf45039e0..3203042b36ef 100644 --- a/trunk/fs/ext2/ext2.h +++ b/trunk/fs/ext2/ext2.h @@ -113,6 +113,9 @@ extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); +/* fsync.c */ +extern int ext2_sync_file (struct file *, struct dentry *, int); + /* ialloc.c */ extern struct inode * ext2_new_inode (struct inode *, int); extern void ext2_free_inode (struct inode *); diff --git a/trunk/fs/ext2/file.c b/trunk/fs/ext2/file.c index 2b9e47dc9222..45ed07122182 100644 --- a/trunk/fs/ext2/file.c +++ b/trunk/fs/ext2/file.c @@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = { .mmap = generic_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = simple_fsync, + .fsync = ext2_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; @@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = { .mmap = xip_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = simple_fsync, + .fsync = ext2_sync_file, }; #endif diff --git a/trunk/fs/ext2/fsync.c b/trunk/fs/ext2/fsync.c new file mode 100644 index 000000000000..fc66c93fcb5c --- /dev/null +++ b/trunk/fs/ext2/fsync.c @@ -0,0 +1,50 @@ +/* + * linux/fs/ext2/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include "ext2.h" +#include /* for sync_mapping_buffers() */ + + +/* + * File may be NULL when we are called. Perhaps we shouldn't + * even pass file to fsync ? + */ + +int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = ext2_sync_inode(inode); + if (ret == 0) + ret = err; + return ret; +} diff --git a/trunk/fs/ext2/inode.c b/trunk/fs/ext2/inode.c index 29ed682061f6..acf678831103 100644 --- a/trunk/fs/ext2/inode.c +++ b/trunk/fs/ext2/inode.c @@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +static int ext2_update_inode(struct inode * inode, int do_sync); + /* * Test whether an inode is a fast symlink. 
*/ @@ -64,7 +66,7 @@ void ext2_delete_inode (struct inode * inode) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_write_inode(inode, inode_needs_sync(inode)); + ext2_update_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1335,7 +1337,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) return ERR_PTR(ret); } -int ext2_write_inode(struct inode *inode, int do_sync) +static int ext2_update_inode(struct inode * inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1440,6 +1442,11 @@ int ext2_write_inode(struct inode *inode, int do_sync) return err; } +int ext2_write_inode(struct inode *inode, int wait) +{ + return ext2_update_inode(inode, wait); +} + int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { diff --git a/trunk/fs/ext2/super.c b/trunk/fs/ext2/super.c index 458999638c3d..e3c748faf2db 100644 --- a/trunk/fs/ext2/super.c +++ b/trunk/fs/ext2/super.c @@ -42,7 +42,6 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); -static int ext2_sync_fs(struct super_block *sb, int wait); void ext2_error (struct super_block * sb, const char * function, const char * fmt, ...) @@ -115,11 +114,6 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - lock_kernel(); - - if (sb->s_dirt) - ext2_write_super(sb); - ext2_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -141,7 +135,7 @@ static void ext2_put_super (struct super_block * sb) kfree(sbi->s_blockgroup_lock); kfree(sbi); - unlock_kernel(); + return; } static struct kmem_cache * ext2_inode_cachep; @@ -310,7 +304,6 @@ static const struct super_operations ext2_sops = { .delete_inode = ext2_delete_inode, .put_super = ext2_put_super, .write_super = ext2_write_super, - .sync_fs = ext2_sync_fs, .statfs = ext2_statfs, .remount_fs = ext2_remount, .clear_inode = ext2_clear_inode, @@ -1134,36 +1127,25 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) * set s_state to EXT2_VALID_FS after some corrections. 
*/ -static int ext2_sync_fs(struct super_block *sb, int wait) +void ext2_write_super (struct super_block * sb) { - struct ext2_super_block *es = EXT2_SB(sb)->s_es; - + struct ext2_super_block * es; lock_kernel(); - if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { - ext2_debug("setting valid to 0\n"); - es->s_state &= cpu_to_le16(~EXT2_VALID_FS); - es->s_free_blocks_count = - cpu_to_le32(ext2_count_free_blocks(sb)); - es->s_free_inodes_count = - cpu_to_le32(ext2_count_free_inodes(sb)); - es->s_mtime = cpu_to_le32(get_seconds()); - ext2_sync_super(sb, es); - } else { - ext2_commit_super(sb, es); + if (!(sb->s_flags & MS_RDONLY)) { + es = EXT2_SB(sb)->s_es; + + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { + ext2_debug ("setting valid to 0\n"); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); + es->s_mtime = cpu_to_le32(get_seconds()); + ext2_sync_super(sb, es); + } else + ext2_commit_super (sb, es); } sb->s_dirt = 0; unlock_kernel(); - - return 0; -} - - -void ext2_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - ext2_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int ext2_remount (struct super_block * sb, int * flags, char * data) @@ -1175,8 +1157,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; - lock_kernel(); - /* Store the old options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -1212,16 +1192,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - } if (*flags & MS_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || - !(sbi->s_mount_state & EXT2_VALID_FS)) { - unlock_kernel(); + !(sbi->s_mount_state & EXT2_VALID_FS)) return 0; - } /* * OK, we are remounting a valid rw partition rdonly, so set * the rdonly flag and then mark the partition as valid again. 
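The ext2_write_super() body restored above is the classic s_dirt contract: code that changes in-core superblock accounting sets sb->s_dirt, and the periodic ->write_super call recomputes the free-block and free-inode counts, stamps s_mtime, writes the superblock, and clears the flag; on a read-only mount nothing is written but the flag is still cleared. A toy model of that contract in plain C (invented names, not the ext2 code):

#include <stdio.h>

struct toy_super {
	int s_dirt;			/* in-core state differs from disk */
	int rdonly;
	unsigned int free_blocks;	/* in-core counter */
	unsigned int disk_free_blocks;	/* last value written out */
};

static void toy_write_super(struct toy_super *sb)
{
	if (!sb->rdonly && sb->s_dirt) {
		sb->disk_free_blocks = sb->free_blocks;	/* recompute + write */
		printf("super written: free_blocks=%u\n", sb->disk_free_blocks);
	}
	sb->s_dirt = 0;	/* cleared even read-only, as in the hunk above */
}

int main(void)
{
	struct toy_super sb = { 0, 0, 100, 100 };

	sb.free_blocks -= 8;	/* an allocation dirties the in-core super */
	sb.s_dirt = 1;
	toy_write_super(&sb);	/* flushes and clears s_dirt */
	toy_write_super(&sb);	/* no-op: nothing dirty */
	return 0;
}

Clearing s_dirt even on the read-only path mirrors the code above and keeps a read-only superblock from being re-offered to the writer on every sync pass.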
@@ -1248,14 +1224,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags &= ~MS_RDONLY; } ext2_sync_super(sb, es); - unlock_kernel(); return 0; restore_opts: sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sb->s_flags = old_sb_flags; - unlock_kernel(); return err; } diff --git a/trunk/fs/ext3/balloc.c b/trunk/fs/ext3/balloc.c index 27967f92e820..225202db8974 100644 --- a/trunk/fs/ext3/balloc.c +++ b/trunk/fs/ext3/balloc.c @@ -649,7 +649,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, count = overflow; goto do_more; } - + sb->s_dirt = 1; error_return: brelse(bitmap_bh); ext3_std_error(sb, err); @@ -1708,6 +1708,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (!fatal) fatal = err; + sb->s_dirt = 1; if (fatal) goto out; diff --git a/trunk/fs/ext3/ialloc.c b/trunk/fs/ext3/ialloc.c index b39991285136..dd13d60d524b 100644 --- a/trunk/fs/ext3/ialloc.c +++ b/trunk/fs/ext3/ialloc.c @@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) err = ext3_journal_dirty_metadata(handle, bitmap_bh); if (!fatal) fatal = err; - + sb->s_dirt = 1; error_return: brelse(bitmap_bh); ext3_std_error(sb, fatal); @@ -537,6 +537,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); + sb->s_dirt = 1; inode->i_uid = current_fsuid(); if (test_opt (sb, GRPID)) diff --git a/trunk/fs/ext3/inode.c b/trunk/fs/ext3/inode.c index b0248c6d5d4c..fcfa24361856 100644 --- a/trunk/fs/ext3/inode.c +++ b/trunk/fs/ext3/inode.c @@ -2960,6 +2960,7 @@ static int ext3_do_update_inode(handle_t *handle, ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); diff --git a/trunk/fs/ext3/resize.c b/trunk/fs/ext3/resize.c index 8a0b26340b54..78fdf3836370 100644 --- a/trunk/fs/ext3/resize.c +++ b/trunk/fs/ext3/resize.c @@ -934,6 +934,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) EXT3_INODES_PER_GROUP(sb)); ext3_journal_dirty_metadata(handle, sbi->s_sbh); + sb->s_dirt = 1; exit_journal: unlock_super(sb); @@ -1065,6 +1066,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + sb->s_dirt = 1; unlock_super(sb); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); diff --git a/trunk/fs/ext3/super.c b/trunk/fs/ext3/super.c index 26aa64dee6aa..3c70d52afb10 100644 --- a/trunk/fs/ext3/super.c +++ b/trunk/fs/ext3/super.c @@ -67,6 +67,7 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext3_unfreeze(struct super_block *sb); +static void ext3_write_super (struct super_block * sb); static int ext3_freeze(struct super_block *sb); /* @@ -398,8 +399,6 @@ static void ext3_put_super (struct super_block * sb) struct ext3_super_block *es = sbi->s_es; int i, err; - lock_kernel(); - ext3_xattr_put_super(sb); err = journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -448,8 +447,7 @@ static void ext3_put_super (struct 
super_block * sb) sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); - - unlock_kernel(); + return; } static struct kmem_cache *ext3_inode_cachep; @@ -763,6 +761,7 @@ static const struct super_operations ext3_sops = { .dirty_inode = ext3_dirty_inode, .delete_inode = ext3_delete_inode, .put_super = ext3_put_super, + .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, .freeze_fs = ext3_freeze, .unfreeze_fs = ext3_unfreeze, @@ -1786,6 +1785,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) #else es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif + sb->s_dirt = 1; } if (sbi->s_blocks_per_group > blocksize * 8) { @@ -2265,6 +2265,7 @@ static int ext3_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2307,6 +2308,7 @@ static int ext3_create_journal(struct super_block * sb, EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2352,6 +2354,7 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; ext3_commit_super(sb, es, 1); } unlock_super(sb); @@ -2410,14 +2413,29 @@ int ext3_force_commit(struct super_block *sb) return 0; journal = EXT3_SB(sb)->s_journal; + sb->s_dirt = 0; ret = ext3_journal_force_commit(journal); return ret; } +/* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. (We can probably nuke this function altogether, and remove + * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...) 
+ */ +static void ext3_write_super (struct super_block * sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + sb->s_dirt = 0; if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -2433,6 +2451,7 @@ static int ext3_freeze(struct super_block *sb) { int error = 0; journal_t *journal; + sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { journal = EXT3_SB(sb)->s_journal; @@ -2490,10 +2509,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif - lock_kernel(); - /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -2601,8 +2617,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) old_opts.s_qf_names[i] != sbi->s_qf_names[i]) kfree(old_opts.s_qf_names[i]); #endif - unlock_super(sb); - unlock_kernel(); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2619,8 +2633,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); - unlock_kernel(); return err; } diff --git a/trunk/fs/ext3/xattr.c b/trunk/fs/ext3/xattr.c index 545e37c4b91e..83b7be849bd5 100644 --- a/trunk/fs/ext3/xattr.c +++ b/trunk/fs/ext3/xattr.c @@ -463,6 +463,7 @@ static void ext3_xattr_update_super_block(handle_t *handle, if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); + sb->s_dirt = 1; ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); } } diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index 012c4251397e..f016707597a7 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -576,11 +576,6 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; - lock_super(sb); - lock_kernel(); - if (sb->s_dirt) - ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -647,6 +642,8 @@ static void ext4_put_super(struct super_block *sb) unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); + lock_super(sb); + lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); } @@ -3336,9 +3333,7 @@ int ext4_force_commit(struct super_block *sb) static void ext4_write_super(struct super_block *sb) { - lock_super(sb); ext4_commit_super(sb, 1); - unlock_super(sb); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -3422,10 +3417,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int i; #endif - lock_kernel(); - /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -3559,8 +3551,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_qf_names[i] != sbi->s_qf_names[i]) kfree(old_opts.s_qf_names[i]); #endif - unlock_super(sb); - unlock_kernel(); return 0; restore_opts: @@ -3580,8 +3570,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); - unlock_kernel(); return err; } diff --git a/trunk/fs/fat/dir.c b/trunk/fs/fat/dir.c index f3500294eec5..3a7f603b6982 100644 --- a/trunk/fs/fat/dir.c +++ b/trunk/fs/fat/dir.c @@ -840,7 
+840,7 @@ const struct file_operations fat_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif - .fsync = fat_file_fsync, + .fsync = file_fsync, }; static int fat_get_short_entry(struct inode *dir, loff_t *pos, @@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) de++; nr_slots--; } - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); @@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) de--; nr_slots--; } - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); @@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, } memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); + mark_buffer_dirty(bhs[n]); n++; blknr++; @@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); - mark_buffer_dirty_inode(bhs[0], dir); + mark_buffer_dirty(bhs[0]); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); if (err) @@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, slots += copy; size -= copy; set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); + mark_buffer_dirty(bhs[n]); if (!size) break; n++; @@ -1293,7 +1293,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, for (i = 0; i < long_bhs; i++) { int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); - mark_buffer_dirty_inode(bhs[i], dir); + mark_buffer_dirty(bhs[i]); offset = 0; slots += copy; size -= copy; @@ -1304,7 +1304,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, /* Fill the short name slot. 
*/ int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); - mark_buffer_dirty_inode(bhs[i], dir); + mark_buffer_dirty(bhs[i]); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bhs[i]); } diff --git a/trunk/fs/fat/fat.h b/trunk/fs/fat/fat.h index e4d88527b5dd..ea440d65819c 100644 --- a/trunk/fs/fat/fat.h +++ b/trunk/fs/fat/fat.h @@ -74,7 +74,6 @@ struct msdos_sb_info { int fatent_shift; struct fatent_operations *fatent_ops; - struct inode *fat_inode; spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; @@ -252,7 +251,6 @@ struct fat_entry { } u; int nr_bhs; struct buffer_head *bhs[2]; - struct inode *fat_inode; }; static inline void fatent_init(struct fat_entry *fatent) @@ -261,7 +259,6 @@ static inline void fatent_init(struct fat_entry *fatent) fatent->entry = 0; fatent->u.ent32_p = NULL; fatent->bhs[0] = fatent->bhs[1] = NULL; - fatent->fat_inode = NULL; } static inline void fatent_set_entry(struct fat_entry *fatent, int entry) @@ -278,7 +275,6 @@ static inline void fatent_brelse(struct fat_entry *fatent) brelse(fatent->bhs[i]); fatent->nr_bhs = 0; fatent->bhs[0] = fatent->bhs[1] = NULL; - fatent->fat_inode = NULL; } extern void fat_ent_access_init(struct super_block *sb); @@ -300,8 +296,6 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr); extern void fat_truncate(struct inode *inode); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int fat_file_fsync(struct file *file, struct dentry *dentry, - int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos); diff --git a/trunk/fs/fat/fatent.c b/trunk/fs/fat/fatent.c index 618f5305c2e4..da6eea47872f 100644 --- a/trunk/fs/fat/fatent.c +++ b/trunk/fs/fat/fatent.c @@ -73,8 +73,6 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, struct buffer_head **bhs = fatent->bhs; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); - fatent->fat_inode = MSDOS_SB(sb)->fat_inode; - bhs[0] = sb_bread(sb, blocknr); if (!bhs[0]) goto err; @@ -105,7 +103,6 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); - fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", @@ -170,9 +167,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new) } spin_unlock(&fat12_entry_lock); - mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[0]); if (fatent->nr_bhs == 2) - mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[1]); } static void fat16_ent_put(struct fat_entry *fatent, int new) @@ -181,7 +178,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new) new = EOF_FAT16; *fatent->u.ent16_p = cpu_to_le16(new); - mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[0]); } static void fat32_ent_put(struct fat_entry *fatent, int new) @@ -192,7 +189,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new) WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); - mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[0]); } static int fat12_ent_next(struct fat_entry *fatent) @@ -384,7 +381,7 @@ static int 
@@ -384,7 +381,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
 		}
 		memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 		set_buffer_uptodate(c_bh);
-		mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
+		mark_buffer_dirty(c_bh);
 		if (sb->s_flags & MS_SYNCHRONOUS)
 			err = sync_dirty_buffer(c_bh);
 		brelse(c_bh);
diff --git a/trunk/fs/fat/file.c b/trunk/fs/fat/file.c
index e955a56b4e5e..0a7f4a9918b3 100644
--- a/trunk/fs/fat/file.c
+++ b/trunk/fs/fat/file.c
@@ -133,18 +133,6 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int res, err;
-
-	res = simple_fsync(filp, dentry, datasync);
-	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
-
-	return res ? res : err;
-}
-
-
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -154,7 +142,7 @@ const struct file_operations fat_file_operations = {
 	.mmap		= generic_file_mmap,
 	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
-	.fsync		= fat_file_fsync,
+	.fsync		= file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
diff --git a/trunk/fs/fat/inode.c b/trunk/fs/fat/inode.c
index 51a5ecf9000a..296785a0dec8 100644
--- a/trunk/fs/fat/inode.c
+++ b/trunk/fs/fat/inode.c
@@ -441,35 +441,16 @@ static void fat_clear_inode(struct inode *inode)
 
 static void fat_write_super(struct super_block *sb)
 {
-	lock_super(sb);
 	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY))
 		fat_clusters_flush(sb);
-	unlock_super(sb);
-}
-
-static int fat_sync_fs(struct super_block *sb, int wait)
-{
-	lock_super(sb);
-	fat_clusters_flush(sb);
-	sb->s_dirt = 0;
-	unlock_super(sb);
-
-	return 0;
 }
 
 static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
-	lock_kernel();
-
-	if (sb->s_dirt)
-		fat_write_super(sb);
-
-	iput(sbi->fat_inode);
-
 	if (sbi->nls_disk) {
 		unload_nls(sbi->nls_disk);
 		sbi->nls_disk = NULL;
@@ -486,8 +467,6 @@ static void fat_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *fat_inode_cachep;
@@ -653,7 +632,6 @@ static const struct super_operations fat_sops = {
 	.delete_inode	= fat_delete_inode,
 	.put_super	= fat_put_super,
 	.write_super	= fat_write_super,
-	.sync_fs	= fat_sync_fs,
 	.statfs		= fat_statfs,
 	.clear_inode	= fat_clear_inode,
 	.remount_fs	= fat_remount,
@@ -1196,7 +1174,7 @@ static int fat_read_root(struct inode *inode)
 int fat_fill_super(struct super_block *sb, void *data, int silent,
 		   const struct inode_operations *fs_dir_inode_ops, int isvfat)
 {
-	struct inode *root_inode = NULL, *fat_inode = NULL;
+	struct inode *root_inode = NULL;
 	struct buffer_head *bh;
 	struct fat_boot_sector *b;
 	struct msdos_sb_info *sbi;
@@ -1436,11 +1414,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	}
 
 	error = -ENOMEM;
-	fat_inode = new_inode(sb);
-	if (!fat_inode)
-		goto out_fail;
-	MSDOS_I(fat_inode)->i_pos = 0;
-	sbi->fat_inode = fat_inode;
 	root_inode = new_inode(sb);
 	if (!root_inode)
 		goto out_fail;
@@ -1466,8 +1439,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 		       " on dev %s.\n", sb->s_id);
 
 out_fail:
-	if (fat_inode)
-		iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
 	if (sbi->nls_io)
diff --git a/trunk/fs/fat/namei_msdos.c b/trunk/fs/fat/namei_msdos.c
index 20f522861355..da3f361a37dd 100644
--- a/trunk/fs/fat/namei_msdos.c
+++ b/trunk/fs/fat/namei_msdos.c
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 			int start = MSDOS_I(new_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			if (IS_DIRSYNC(new_dir)) {
 				err = sync_dirty_buffer(dotdot_bh);
 				if (err)
@@ -586,7 +586,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 			int start = MSDOS_I(old_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			corrupt |= sync_dirty_buffer(dotdot_bh);
 		}
 error_inode:
diff --git a/trunk/fs/fat/namei_vfat.c b/trunk/fs/fat/namei_vfat.c
index b50ecbe97f83..a0e00e3a46e9 100644
--- a/trunk/fs/fat/namei_vfat.c
+++ b/trunk/fs/fat/namei_vfat.c
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 			int start = MSDOS_I(new_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			if (IS_DIRSYNC(new_dir)) {
 				err = sync_dirty_buffer(dotdot_bh);
 				if (err)
@@ -1009,7 +1009,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 			int start = MSDOS_I(old_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			corrupt |= sync_dirty_buffer(dotdot_bh);
 		}
 error_inode:
diff --git a/trunk/fs/file_table.c b/trunk/fs/file_table.c
index 334ce39881f8..54018fe48840 100644
--- a/trunk/fs/file_table.c
+++ b/trunk/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
 	 */
 	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
 		file_take_write(file);
-		error = mnt_clone_write(mnt);
+		error = mnt_want_write(mnt);
 		WARN_ON(error);
 	}
 	return error;
@@ -399,44 +399,6 @@ int fs_may_remount_ro(struct super_block *sb)
 	return 0;
 }
 
-/**
- *	mark_files_ro - mark all files read-only
- *	@sb: superblock in question
- *
- *	All files are marked read-only.  We don't care about pending
- *	delete files so this should be used in 'force' mode only.
- */
-void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-			continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		f->f_mode &= ~FMODE_WRITE;
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
-	}
-	file_list_unlock();
-}
-
 void __init files_init(unsigned long mempages)
 {
 	int n;
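Editorial aside, not part of the patch: the mark_files_ro() loop removed above is a textbook instance of the lock-drop-and-retry idiom, where a sleeping call cannot be made under a spinlock, so the walker drops the lock, sleeps, and restarts the scan from the top. A minimal userspace sketch of the same shape follows; all names in it are hypothetical.

#include <pthread.h>

/* Hypothetical types; this models only the idiom, not the kernel API. */
struct item {
	struct item *next;
	int needs_slow_work;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

/* Stands in for mnt_drop_write()/mntput(): may block, so it must not
 * run with list_lock held. */
static void slow_work(struct item *it)
{
	(void)it;
}

static void process_all(void)
{
	struct item *it;
retry:
	pthread_mutex_lock(&list_lock);
	for (it = head; it; it = it->next) {
		if (!it->needs_slow_work)
			continue;
		it->needs_slow_work = 0;	/* claim the item while locked */
		pthread_mutex_unlock(&list_lock);
		slow_work(it);			/* safe: lock was dropped */
		goto retry;			/* list may have changed meanwhile */
	}
	pthread_mutex_unlock(&list_lock);
}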
diff --git a/trunk/fs/freevxfs/vxfs_super.c b/trunk/fs/freevxfs/vxfs_super.c
index cdbd1654e4cd..1dacda831577 100644
--- a/trunk/fs/freevxfs/vxfs_super.c
+++ b/trunk/fs/freevxfs/vxfs_super.c
@@ -80,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
 {
 	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
 
-	lock_kernel();
-
 	vxfs_put_fake_inode(infp->vsi_fship);
 	vxfs_put_fake_inode(infp->vsi_ilist);
 	vxfs_put_fake_inode(infp->vsi_stilist);
 
 	brelse(infp->vsi_bp);
 	kfree(infp);
-
-	unlock_kernel();
 }
 
 /**
diff --git a/trunk/fs/fs-writeback.c b/trunk/fs/fs-writeback.c
index 40308e98c6a4..91013ff7dd53 100644
--- a/trunk/fs/fs-writeback.c
+++ b/trunk/fs/fs-writeback.c
@@ -64,28 +64,6 @@ static void writeback_release(struct backing_dev_info *bdi)
 	clear_bit(BDI_pdflush, &bdi->state);
 }
 
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
-{
-	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
-		struct dentry *dentry;
-		const char *name = "?";
-
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			spin_lock(&dentry->d_lock);
-			name = (const char *) dentry->d_name.name;
-		}
-		printk(KERN_DEBUG
-		       "%s(%d): dirtied inode %lu (%s) on %s\n",
-		       current->comm, task_pid_nr(current), inode->i_ino,
-		       name, inode->i_sb->s_id);
-		if (dentry) {
-			spin_unlock(&dentry->d_lock);
-			dput(dentry);
-		}
-	}
-}
-
 /**
  *	__mark_inode_dirty - internal function
  *	@inode: inode to mark
@@ -136,8 +114,23 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
-	if (unlikely(block_dump))
-		block_dump___mark_inode_dirty(inode);
+	if (unlikely(block_dump)) {
+		struct dentry *dentry = NULL;
+		const char *name = "?";
+
+		if (!list_empty(&inode->i_dentry)) {
+			dentry = list_entry(inode->i_dentry.next,
+					    struct dentry, d_alias);
+			if (dentry && dentry->d_name.name)
+				name = (const char *) dentry->d_name.name;
+		}
+
+		if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
+			printk(KERN_DEBUG
+			       "%s(%d): dirtied inode %lu (%s) on %s\n",
+			       current->comm, task_pid_nr(current), inode->i_ino,
+			       name, inode->i_sb->s_id);
+	}
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
@@ -296,6 +289,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
+	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -320,6 +314,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
@@ -683,6 +678,55 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 	sync_sb_inodes(sb, &wbc);
 }
 
+/**
+ * sync_inodes - writes all inodes to disk
+ * @wait: wait for completion
+ *
+ * sync_inodes() goes through each super block's dirty inode list, writes the
+ * inodes out, waits on the writeout and puts the inodes back on the normal
+ * list.
+ *
+ * This is for sys_sync().  fsync_dev() uses the same algorithm.  The subtle
+ * part of the sync functions is that the blockdev "superblock" is processed
+ * last.  This is because the write_inode() function of a typical fs will
+ * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
+ * What we want to do is to perform all that dirtying first, and then write
+ * back all those inode blocks via the blockdev mapping in one sweep.  So the
+ * additional (somewhat redundant) sync_blockdev() calls here are to make
+ * sure that really happens.  Because if we call sync_inodes_sb(wait=1) with
+ * outstanding dirty inodes, the writeback goes block-at-a-time within the
+ * filesystem's write_inode().  This is extremely slow.
+ */
+static void __sync_inodes(int wait)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root) {
+			sync_inodes_sb(sb, wait);
+			sync_blockdev(sb->s_bdev);
+		}
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+}
+
+void sync_inodes(int wait)
+{
+	__sync_inodes(0);
+
+	if (wait)
+		__sync_inodes(1);
+}
+
 /**
  * write_inode_now - write an inode to disk
  *	@inode: inode to write to disk
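Editorial aside, not part of the patch: the doc comment restored above explains why sync_inodes() makes two sweeps, one non-blocking pass to start I/O everywhere and, for a waiting sync, a second pass that blocks only after the queues are full. A userspace sketch of that two-pass shape, with entirely hypothetical names:

struct dev { int id; };

static void dev_flush(struct dev *d, int wait)
{
	(void)d;
	(void)wait;	/* stand-in: start I/O on one device, optionally wait */
}

static void flush_all(struct dev **devs, int ndev, int wait)
{
	int pass, i;

	/* Pass one (wait == 0) gets all I/O moving without blocking;
	 * pass two (wait == 1) runs only for a waiting sync and blocks
	 * once per device after everything is already in flight. */
	for (pass = 0; pass <= (wait ? 1 : 0); pass++)
		for (i = 0; i < ndev; i++)
			dev_flush(devs[i], pass);
}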
diff --git a/trunk/fs/gfs2/log.c b/trunk/fs/gfs2/log.c
index f2e449c595b4..aa62cf5976e8 100644
--- a/trunk/fs/gfs2/log.c
+++ b/trunk/fs/gfs2/log.c
@@ -764,6 +764,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	gfs2_log_unlock(sdp);
 
+	sdp->sd_vfs->s_dirt = 0;
 	up_write(&sdp->sd_log_flush_lock);
 
 	kfree(ai);
@@ -822,6 +823,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	log_refund(sdp, tr);
 	buf_lo_incore_commit(sdp, tr);
 
+	sdp->sd_vfs->s_dirt = 1;
 	up_read(&sdp->sd_log_flush_lock);
 
 	gfs2_log_lock(sdp);
diff --git a/trunk/fs/gfs2/super.c b/trunk/fs/gfs2/super.c
index c8930b31cdf0..40bcc37e5a70 100644
--- a/trunk/fs/gfs2/super.c
+++ b/trunk/fs/gfs2/super.c
@@ -719,8 +719,6 @@ static void gfs2_put_super(struct super_block *sb)
 	int error;
 	struct gfs2_jdesc *jd;
 
-	lock_kernel();
-
 	/*  Unfreeze the filesystem, if we need to  */
 
 	mutex_lock(&sdp->sd_freeze_lock);
@@ -787,8 +785,17 @@
 	/*  At this point, we're through participating in the lockspace  */
 
 	gfs2_sys_fs_del(sdp);
+}
+
+/**
+ * gfs2_write_super
+ * @sb: the superblock
+ *
+ */
 
-	unlock_kernel();
+static void gfs2_write_super(struct super_block *sb)
+{
+	sb->s_dirt = 0;
 }
 
 /**
@@ -800,6 +807,7 @@ static void gfs2_put_super(struct super_block *sb)
 
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
+	sb->s_dirt = 0;
 	if (wait && sb->s_fs_info)
 		gfs2_log_flush(sb->s_fs_info, NULL);
 	return 0;
}
@@ -1316,6 +1324,7 @@ const struct super_operations gfs2_super_ops = {
 	.write_inode		= gfs2_write_inode,
 	.delete_inode		= gfs2_delete_inode,
 	.put_super		= gfs2_put_super,
+	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
diff --git a/trunk/fs/hfs/super.c b/trunk/fs/hfs/super.c
index 6f833dc8e910..a36bb749926d 100644
--- a/trunk/fs/hfs/super.c
+++ b/trunk/fs/hfs/super.c
@@ -49,23 +49,11 @@ MODULE_LICENSE("GPL");
  */
 static void hfs_write_super(struct super_block *sb)
 {
-	lock_super(sb);
 	sb->s_dirt = 0;
-
+	if (sb->s_flags & MS_RDONLY)
+		return;
 	/* sync everything to the buffers */
-	if (!(sb->s_flags & MS_RDONLY))
-		hfs_mdb_commit(sb);
-	unlock_super(sb);
-}
-
-static int hfs_sync_fs(struct super_block *sb, int wait)
-{
-	lock_super(sb);
 	hfs_mdb_commit(sb);
-	sb->s_dirt = 0;
-	unlock_super(sb);
-
-	return 0;
 }
 
 /*
@@ -77,15 +65,9 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
-	if (sb->s_dirt)
-		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -182,7 +164,6 @@ static const struct super_operations hfs_super_operations = {
 	.clear_inode	= hfs_clear_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
-	.sync_fs	= hfs_sync_fs,
 	.statfs		= hfs_statfs,
 	.remount_fs	= hfs_remount,
 	.show_options	= hfs_show_options,
diff --git a/trunk/fs/hfsplus/super.c b/trunk/fs/hfsplus/super.c
index 9fc3af0c0dab..f2a64020f42e 100644
--- a/trunk/fs/hfsplus/super.c
+++ b/trunk/fs/hfsplus/super.c
@@ -152,14 +152,15 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static int hfsplus_sync_fs(struct super_block *sb, int wait)
+static void hfsplus_write_super(struct super_block *sb)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
-
-	lock_super(sb);
 	sb->s_dirt = 0;
+	if (sb->s_flags & MS_RDONLY)
+		/* warn? */
+		return;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -191,16 +192,6 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	unlock_super(sb);
-	return 0;
-}
-
-static void hfsplus_write_super(struct super_block *sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		hfsplus_sync_fs(sb, 1);
-	else
-		sb->s_dirt = 0;
 }
 
 static void hfsplus_put_super(struct super_block *sb)
@@ -208,11 +199,6 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
-
-	lock_kernel();
-
-	if (sb->s_dirt)
-		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
@@ -232,8 +218,6 @@ static void hfsplus_put_super(struct super_block *sb)
 	unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -295,7 +279,6 @@ static const struct super_operations hfsplus_sops = {
 	.clear_inode	= hfsplus_clear_inode,
 	.put_super	= hfsplus_put_super,
 	.write_super	= hfsplus_write_super,
-	.sync_fs	= hfsplus_sync_fs,
 	.statfs		= hfsplus_statfs,
 	.remount_fs	= hfsplus_remount,
 	.show_options	= hfsplus_show_options,
diff --git a/trunk/fs/hpfs/super.c b/trunk/fs/hpfs/super.c
index f2feaa06bf26..fc77965be841 100644
--- a/trunk/fs/hpfs/super.c
+++ b/trunk/fs/hpfs/super.c
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -100,16 +99,11 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
-
-	lock_kernel();
-
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -399,8 +393,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
-	lock_kernel();
-	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
 	lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -433,13 +425,9 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	replace_mount_options(s, new_opts);
 
-	unlock_super(s);
-	unlock_kernel();
 	return 0;
 
out_err:
-	unlock_super(s);
-	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/trunk/fs/inode.c b/trunk/fs/inode.c
index a88baebf77cf..bca0c618fdb3 100644
--- a/trunk/fs/inode.c
+++ b/trunk/fs/inode.c
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -190,10 +189,6 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
 
-#ifdef CONFIG_FSNOTIFY
-	inode->i_fsnotify_mask = 0;
-#endif
-
 	return inode;
 
out_free_security:
@@ -226,7 +221,6 @@ void destroy_inode(struct inode *inode)
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
 	security_inode_free(inode);
-	fsnotify_inode_delete(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
@@ -258,9 +252,6 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
-#ifdef CONFIG_FSNOTIFY
-	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
-#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
@@ -407,7 +398,6 @@ int invalidate_inodes(struct super_block *sb)
 	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
-	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
@@ -1422,7 +1412,7 @@ void file_update_time(struct file *file)
 	if (IS_NOCMTIME(inode))
 		return;
 
-	err = mnt_want_write_file(file);
+	err = mnt_want_write(file->f_path.mnt);
 	if (err)
 		return;
diff --git a/trunk/fs/internal.h b/trunk/fs/internal.h
index d55ef562f0bb..b4dac4fb6b61 100644
--- a/trunk/fs/internal.h
+++ b/trunk/fs/internal.h
@@ -25,8 +25,6 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 	return sb == blockdev_superblock;
 }
 
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-
 #else
 static inline void bdev_cache_init(void)
 {
@@ -36,11 +34,6 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 {
 	return 0;
 }
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
-	return 0;
-}
 #endif
 
 /*
@@ -73,13 +66,3 @@ extern void __init mnt_init(void);
 * fs_struct.c
 */
 extern void chroot_fs_refs(struct path *, struct path *);
-
-/*
- * file_table.c
- */
-extern void mark_files_ro(struct super_block *);
-
-/*
- * super.c
- */
-extern int do_remount_sb(struct super_block *, int, void *, int);
diff --git a/trunk/fs/isofs/inode.c b/trunk/fs/isofs/inode.c
index 068b34b5a107..b4cbe9603c7d 100644
--- a/trunk/fs/isofs/inode.c
+++ b/trunk/fs/isofs/inode.c
@@ -42,16 +42,11 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
 static void isofs_put_super(struct super_block *sb)
 {
 	struct isofs_sb_info *sbi = ISOFS_SB(sb);
-
 #ifdef CONFIG_JOLIET
-	lock_kernel();
-
 	if (sbi->s_nls_iocharset) {
 		unload_nls(sbi->s_nls_iocharset);
 		sbi->s_nls_iocharset = NULL;
 	}
-
-	unlock_kernel();
 #endif
 
 	kfree(sbi);
diff --git a/trunk/fs/jffs2/fs.c b/trunk/fs/jffs2/fs.c
index 3451a81b2142..249305d65d5b 100644
--- a/trunk/fs/jffs2/fs.c
+++ b/trunk/fs/jffs2/fs.c
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -388,7 +387,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if necessary, else we lose it */
-	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -401,10 +399,24 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
-	unlock_kernel();
 	return 0;
 }
 
+void jffs2_write_super (struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	sb->s_dirt = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+	jffs2_garbage_collect_trigger(c);
+	jffs2_erase_pending_blocks(c, 0);
+	jffs2_flush_wbuf_gc(c, 0);
+}
+
+
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
    fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/trunk/fs/jffs2/os-linux.h b/trunk/fs/jffs2/os-linux.h
index 2228380c47b9..5e194a5c8e29 100644
--- a/trunk/fs/jffs2/os-linux.h
+++ b/trunk/fs/jffs2/os-linux.h
@@ -181,6 +181,7 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
+void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/trunk/fs/jffs2/super.c b/trunk/fs/jffs2/super.c
index 07a22caf2687..4c4e18c54a51 100644
--- a/trunk/fs/jffs2/super.c
+++ b/trunk/fs/jffs2/super.c
@@ -53,29 +53,10 @@ static void jffs2_i_init_once(void *foo)
 	inode_init_once(&f->vfs_inode);
 }
 
-static void jffs2_write_super(struct super_block *sb)
-{
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-
-	lock_super(sb);
-	sb->s_dirt = 0;
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-		jffs2_garbage_collect_trigger(c);
-		jffs2_erase_pending_blocks(c, 0);
-		jffs2_flush_wbuf_gc(c, 0);
-	}
-
-	unlock_super(sb);
-}
-
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
-	jffs2_write_super(sb);
-
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -193,11 +174,6 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
-	lock_kernel();
-
-	if (sb->s_dirt)
-		jffs2_write_super(sb);
-
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -216,8 +192,6 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
-	unlock_kernel();
-
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
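Editorial aside, not part of the patch: the jffs2 and gfs2 hunks above move those filesystems back onto the s_dirt/write_super contract, where the filesystem raises sb->s_dirt when in-core superblock state changes and a periodic flusher calls ->write_super, which clears the flag before doing its work. A minimal userspace sketch of that contract, with hypothetical types, follows.

/* Hypothetical model of the s_dirt protocol; not the kernel API. */
struct super {
	int s_dirt;
	int read_only;
	void (*write_super)(struct super *);
};

/* A periodic flusher only calls back when the flag is raised. */
static void periodic_flush(struct super *sb)
{
	if (sb->s_dirt && sb->write_super)
		sb->write_super(sb);	/* the callback clears s_dirt itself */
}

static void example_write_super(struct super *sb)
{
	sb->s_dirt = 0;			/* clear first, like jffs2/gfs2 above,
					 * so a concurrent dirtier re-arms it */
	if (sb->read_only)
		return;
	/* ...push in-core superblock state out to the device... */
}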
diff --git a/trunk/fs/jfs/super.c b/trunk/fs/jfs/super.c
index 09b1b6ee2186..d9b0e92b3602 100644
--- a/trunk/fs/jfs/super.c
+++ b/trunk/fs/jfs/super.c
@@ -32,7 +32,6 @@
 #include
 #include
 #include
-#include
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -184,9 +183,6 @@ static void jfs_put_super(struct super_block *sb)
 	int rc;
 
 	jfs_info("In jfs_put_super");
-
-	lock_kernel();
-
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -199,8 +195,6 @@ static void jfs_put_super(struct super_block *sb)
 	sbi->direct_inode = NULL;
 
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 enum {
@@ -376,24 +370,19 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
-	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
-	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
-			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc) {
-			unlock_kernel();
+		if (rc)
 			return rc;
-		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -404,31 +393,23 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		ret = jfs_mount_rw(sb, 1);
-		unlock_kernel();
-		return ret;
+		return jfs_mount_rw(sb, 1);
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
-		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc) {
-				unlock_kernel();
+			if (rc)
 				return rc;
-			}
 			JFS_SBI(sb)->flag = flag;
-			ret = jfs_mount_rw(sb, 1);
-			unlock_kernel();
-			return ret;
+			return jfs_mount_rw(sb, 1);
 		}
 	JFS_SBI(sb)->flag = flag;
 
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/trunk/fs/libfs.c b/trunk/fs/libfs.c
index ddfa89948c3f..80046ddf5063 100644
--- a/trunk/fs/libfs.c
+++ b/trunk/fs/libfs.c
@@ -9,8 +9,6 @@
 #include
 #include
 #include
-#include
-#include
 
 #include
 
@@ -809,29 +807,6 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0, /* metadata-only; caller takes care of data */
-	};
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode(inode, &wbc);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-EXPORT_SYMBOL(simple_fsync);
-
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
diff --git a/trunk/fs/minix/dir.c b/trunk/fs/minix/dir.c
index e5f206467e40..d4946c4c90e2 100644
--- a/trunk/fs/minix/dir.c
+++ b/trunk/fs/minix/dir.c
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t);
 const struct file_operations minix_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= minix_sync_file,
 };
 
 static inline void dir_put_page(struct page *page)
diff --git a/trunk/fs/minix/file.c b/trunk/fs/minix/file.c
index 3eec3e607a87..17765f697e50 100644
--- a/trunk/fs/minix/file.c
+++ b/trunk/fs/minix/file.c
@@ -6,12 +6,15 @@
  *  minix regular file handling primitives
  */
 
+#include	/* for fsync_inode_buffers() */
 #include "minix.h"
 
 /*
 * We have mostly NULLs here: the current defaults are OK for
 * the minix filesystem.
 */
+int minix_sync_file(struct file *, struct dentry *, int);
+
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -19,7 +22,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= minix_sync_file,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -27,3 +30,18 @@ const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
 	.getattr	= minix_getattr,
 };
+
+int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	err = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return err;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return err;
+
+	err |= minix_sync_inode(inode);
+	return err ? -EIO : 0;
+}
diff --git a/trunk/fs/minix/inode.c b/trunk/fs/minix/inode.c
index f91a23693597..daad3c2740db 100644
--- a/trunk/fs/minix/inode.c
+++ b/trunk/fs/minix/inode.c
@@ -35,8 +35,6 @@ static void minix_put_super(struct super_block *sb)
 	int i;
 	struct minix_sb_info *sbi = minix_sb(sb);
 
-	lock_kernel();
-
 	if (!(sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_version != MINIX_V3)	 /* s_state is now out from V3 sb */
 			sbi->s_ms->s_state = sbi->s_mount_state;
@@ -51,7 +49,7 @@ static void minix_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
-	unlock_kernel();
+	return;
 }
 
 static struct kmem_cache * minix_inode_cachep;
@@ -556,25 +554,38 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static int minix_write_inode(struct inode *inode, int wait)
+static struct buffer_head *minix_update_inode(struct inode *inode)
+{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		return V1_minix_update_inode(inode);
+	else
+		return V2_minix_update_inode(inode);
+}
+
+static int minix_write_inode(struct inode * inode, int wait)
+{
+	brelse(minix_update_inode(inode));
+	return 0;
+}
+
+int minix_sync_inode(struct inode * inode)
 {
 	int err = 0;
 	struct buffer_head *bh;
 
-	if (INODE_VERSION(inode) == MINIX_V1)
-		bh = V1_minix_update_inode(inode);
-	else
-		bh = V2_minix_update_inode(inode);
-	if (!bh)
-		return -EIO;
-	if (wait && buffer_dirty(bh)) {
+	bh = minix_update_inode(inode);
+	if (bh && buffer_dirty(bh))
+	{
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+		{
 			printk("IO error syncing minix inode [%s:%08lx]\n",
 				inode->i_sb->s_id, inode->i_ino);
-			err = -EIO;
+			err = -1;
 		}
 	}
+	else if (!bh)
+		err = -1;
 	brelse (bh);
 	return err;
 }
diff --git a/trunk/fs/minix/minix.h b/trunk/fs/minix/minix.h
index cb7fdd11f9a5..e6a0b193bea4 100644
--- a/trunk/fs/minix/minix.h
+++ b/trunk/fs/minix/minix.h
@@ -57,6 +57,7 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
+extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -71,6 +72,7 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
+extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
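Editorial aside, not part of the patch: the restored minix_sync_file() above encodes the standard fsync/fdatasync split, flush the data buffers unconditionally, but write the inode itself only when it is dirty, and for fdatasync only when I_DIRTY_DATASYNC-style state (data-relevant metadata) is pending. A tiny sketch of just that decision, with hypothetical flag names mirroring the kernel's I_DIRTY bits:

/* Hypothetical flags; only the short-circuit logic is the point. */
enum { DIRTY_SYNC = 1, DIRTY_DATASYNC = 2, DIRTY_PAGES = 4 };

static int need_inode_write(int state, int datasync)
{
	if (!(state & (DIRTY_SYNC | DIRTY_DATASYNC | DIRTY_PAGES)))
		return 0;	/* nothing dirty at all */
	if (datasync && !(state & DIRTY_DATASYNC))
		return 0;	/* fdatasync(): timestamp-only dirt can wait */
	return 1;		/* fsync(), or data-relevant metadata pending */
}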
diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c
index 527119afb6a5..c82805d088e1 100644
--- a/trunk/fs/namei.c
+++ b/trunk/fs/namei.c
@@ -552,17 +552,6 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
 	return result;
 }
 
-static __always_inline void set_root(struct nameidata *nd)
-{
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->root = fs->root;
-		path_get(&nd->root);
-		read_unlock(&fs->lock);
-	}
-}
-
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
 	int res = 0;
@@ -571,10 +560,14 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		goto fail;
 
 	if (*link == '/') {
-		set_root(nd);
+		struct fs_struct *fs = current->fs;
+
 		path_put(&nd->path);
-		nd->path = nd->root;
-		path_get(&nd->root);
+
+		read_lock(&fs->lock);
+		nd->path = fs->root;
+		path_get(&fs->root);
+		read_unlock(&fs->lock);
 	}
 
 	res = link_path_walk(link, nd);
@@ -675,23 +668,23 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	return err;
 }
 
-int follow_up(struct path *path)
+int follow_up(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
 	spin_lock(&vfsmount_lock);
-	parent = path->mnt->mnt_parent;
-	if (parent == path->mnt) {
+	parent=(*mnt)->mnt_parent;
+	if (parent == *mnt) {
 		spin_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
-	mountpoint = dget(path->mnt->mnt_mountpoint);
+	mountpoint=dget((*mnt)->mnt_mountpoint);
 	spin_unlock(&vfsmount_lock);
-	dput(path->dentry);
-	path->dentry = mountpoint;
-	mntput(path->mnt);
-	path->mnt = parent;
+	dput(*dentry);
+	*dentry = mountpoint;
+	mntput(*mnt);
+	*mnt = parent;
 	return 1;
 }
 
@@ -702,7 +695,7 @@ static int __follow_mount(struct path *path)
 {
 	int res = 0;
 	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
+		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
 		dput(path->dentry);
@@ -715,32 +708,32 @@ static int __follow_mount(struct path *path)
 	return res;
 }
 
-static void follow_mount(struct path *path)
+static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
-	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
+	while (d_mountpoint(*dentry)) {
+		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
 		if (!mounted)
 			break;
-		dput(path->dentry);
-		mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
+		dput(*dentry);
+		mntput(*mnt);
+		*mnt = mounted;
+		*dentry = dget(mounted->mnt_root);
 	}
 }
 
 /* no need for dcache_lock, as serialization is taken care in
 * namespace.c
 */
-int follow_down(struct path *path)
+int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *mounted;
 
-	mounted = lookup_mnt(path);
+	mounted = lookup_mnt(*mnt, *dentry);
 	if (mounted) {
-		dput(path->dentry);
-		mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
+		dput(*dentry);
+		mntput(*mnt);
+		*mnt = mounted;
+		*dentry = dget(mounted->mnt_root);
 		return 1;
 	}
 	return 0;
@@ -748,16 +741,19 @@ int follow_down(struct path *path)
 
 static __always_inline void follow_dotdot(struct nameidata *nd)
 {
-	set_root(nd);
+	struct fs_struct *fs = current->fs;
 
 	while(1) {
 		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
-		if (nd->path.dentry == nd->root.dentry &&
-		    nd->path.mnt == nd->root.mnt) {
+		read_lock(&fs->lock);
+		if (nd->path.dentry == fs->root.dentry &&
+		    nd->path.mnt == fs->root.mnt) {
+			read_unlock(&fs->lock);
 			break;
 		}
+		read_unlock(&fs->lock);
 		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -779,7 +775,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 			mntput(nd->path.mnt);
 			nd->path.mnt = parent;
 		}
-		follow_mount(&nd->path);
+		follow_mount(&nd->path.mnt, &nd->path.dentry);
 	}
 }
 
@@ -1021,23 +1017,25 @@ static int path_walk(const char *name, struct nameidata *nd)
 	return link_path_walk(name, nd);
 }
 
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
+static int do_path_lookup(int dfd, const char *name,
+			  unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
+	struct fs_struct *fs = current->fs;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 	nd->depth = 0;
-	nd->root.mnt = NULL;
 
 	if (*name=='/') {
-		set_root(nd);
-		nd->path = nd->root;
-		path_get(&nd->root);
+		read_lock(&fs->lock);
+		nd->path = fs->root;
+		path_get(&fs->root);
+		read_unlock(&fs->lock);
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
 		read_lock(&fs->lock);
 		nd->path = fs->pwd;
 		path_get(&fs->pwd);
@@ -1065,29 +1063,17 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		fput_light(file, fput_needed);
 	}
 
-	return 0;
-fput_fail:
-	fput_light(file, fput_needed);
-out_fail:
-	return retval;
-}
-
-/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
-			  unsigned int flags, struct nameidata *nd)
-{
-	int retval = path_init(dfd, name, flags, nd);
-	if (!retval)
-		retval = path_walk(name, nd);
+	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
-	if (nd->root.mnt) {
-		path_put(&nd->root);
-		nd->root.mnt = NULL;
-	}
+out_fail:
 	return retval;
+
+fput_fail:
+	fput_light(file, fput_needed);
+	goto out_fail;
 }
 
 int path_lookup(const char *name, unsigned int flags,
@@ -1127,18 +1113,14 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	nd->path.dentry = dentry;
 	nd->path.mnt = mnt;
 	path_get(&nd->path);
-	nd->root = nd->path;
-	path_get(&nd->root);
 
 	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
 
-	path_put(&nd->root);
-	nd->root.mnt = NULL;
-
 	return retval;
+
 }
 
 /**
@@ -1694,14 +1676,9 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
 		return ERR_PTR(error);
-	error = path_walk(pathname, &nd);
-	if (error)
-		return ERR_PTR(error);
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
 	/*
 	 * We have the parent and last component. First of all, check
@@ -1829,8 +1806,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
exit_parent:
-	if (nd.root.mnt)
-		path_put(&nd.root);
 	path_put(&nd.path);
 	return ERR_PTR(error);
 
diff --git a/trunk/fs/namespace.c b/trunk/fs/namespace.c
index 2dd333b0fe7f..134d494158d9 100644
--- a/trunk/fs/namespace.c
+++ b/trunk/fs/namespace.c
@@ -131,20 +131,10 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-#ifdef CONFIG_SMP
-		mnt->mnt_writers = alloc_percpu(int);
-		if (!mnt->mnt_writers)
-			goto out_free_devname;
-#else
-		mnt->mnt_writers = 0;
-#endif
+		atomic_set(&mnt->__mnt_writers, 0);
 	}
 	return mnt;
 
-#ifdef CONFIG_SMP
-out_free_devname:
-	kfree(mnt->mnt_devname);
-#endif
out_free_id:
 	mnt_free_id(mnt);
out_free_cache:
@@ -181,38 +171,65 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-static inline void inc_mnt_writers(struct vfsmount *mnt)
-{
-#ifdef CONFIG_SMP
-	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-#else
-	mnt->mnt_writers++;
-#endif
-}
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
 
-static inline void dec_mnt_writers(struct vfsmount *mnt)
+static int __init init_mnt_writers(void)
 {
-#ifdef CONFIG_SMP
-	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-#else
-	mnt->mnt_writers--;
-#endif
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
 }
+fs_initcall(init_mnt_writers);
 
-static unsigned int count_mnt_writers(struct vfsmount *mnt)
+static void unlock_mnt_writers(void)
 {
-#ifdef CONFIG_SMP
-	unsigned int count = 0;
 	int cpu;
+	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
 	}
+}
 
-	return count;
-#else
-	return mnt->mnt_writers;
-#endif
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+
+/*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					    struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
 }
 
 /*
@@ -236,73 +253,74 @@ static unsigned int count_mnt_writers(struct vfsmount *mnt)
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
+	struct mnt_writer *cpu_writer;
 
-	preempt_disable();
-	inc_mnt_writers(mnt);
-	/*
-	 * The store to inc_mnt_writers must be visible before we pass
-	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
-	 * incremented count after it has set MNT_WRITE_HOLD.
-	 */
-	smp_mb();
-	while (mnt->mnt_flags & MNT_WRITE_HOLD)
-		cpu_relax();
-	/*
-	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
-	 * be set to match its requirements. So we must not load that until
-	 * MNT_WRITE_HOLD is cleared.
-	 */
-	smp_rmb();
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
 	if (__mnt_is_readonly(mnt)) {
-		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
out:
-	preempt_enable();
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-/**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
-	/* superblock may be r/o */
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	preempt_disable();
-	inc_mnt_writers(mnt);
-	preempt_enable();
-	return 0;
+static void lock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
 }
-EXPORT_SYMBOL_GPL(mnt_clone_write);
 
-/**
- * mnt_want_write_file - get write access to a file's mount
- * @file: the file who's mount on which to take a write
- *
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
 */
-int mnt_want_write_file(struct file *file)
+static void handle_write_count_underflow(struct vfsmount *mnt)
 {
-	if (!(file->f_mode & FMODE_WRITE))
-		return mnt_want_write(file->f_path.mnt);
-	else
-		return mnt_clone_write(file->f_path.mnt);
+	if (atomic_read(&mnt->__mnt_writers) >=
+	    MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * It isn't necessary to hold all of the locks
+	 * at the same time, but doing it this way makes
+	 * us share a lot more code.
+	 */
+	lock_mnt_writers();
+	/*
+	 * vfsmount_lock is for mnt_flags.
+	 */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * If coalescing the per-cpu writer counts did not
+	 * get us back to a positive writer count, we have
+	 * a bug.
+	 */
+	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, atomic_read(&mnt->__mnt_writers));
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
 }
-EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
 * mnt_drop_write - give up write access to a mount
@@ -314,9 +332,37 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
 */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	preempt_disable();
-	dec_mnt_writers(mnt);
-	preempt_enable();
+	int must_check_underflow = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	} else {
+		must_check_underflow = 1;
+		atomic_dec(&mnt->__mnt_writers);
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * Logically, we could call this each time,
+	 * but the __mnt_writers cacheline tends to
+	 * be cold, and makes this expensive.
+	 */
+	if (must_check_underflow)
+		handle_write_count_underflow(mnt);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption.  However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow.  Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -324,41 +370,24 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	spin_lock(&vfsmount_lock);
-	mnt->mnt_flags |= MNT_WRITE_HOLD;
+	lock_mnt_writers();
 	/*
-	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
-	 * should be visible before we do.
+	 * With all the locks held, this value is stable
 	 */
-	smp_mb();
-
-	/*
-	 * With writers on hold, if this value is zero, then there are
-	 * definitely no active writers (although held writers may subsequently
-	 * increment the count, they'll have to wait, and decrement it after
-	 * seeing MNT_READONLY).
-	 *
-	 * It is OK to have counter incremented on one CPU and decremented on
-	 * another: the sum will add up correctly. The danger would be when we
-	 * sum up each counter, if we read a counter before it is incremented,
-	 * but then read another CPU's count which it has been subsequently
-	 * decremented from -- we would see more decrements than we should.
-	 * MNT_WRITE_HOLD protects against this scenario, because
-	 * mnt_want_write first increments count, then smp_mb, then spins on
-	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
-	 * we're counting up here.
-	 */
-	if (count_mnt_writers(mnt) > 0)
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
 		ret = -EBUSY;
-	else
-		mnt->mnt_flags |= MNT_READONLY;
+		goto out;
+	}
 	/*
-	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
-	 * that become unheld will see MNT_READONLY.
+	 * nobody can do a successful mnt_want_write() with all
+	 * of the counts in MNT_DENIED_WRITE and the locks held.
 	 */
-	smp_wmb();
-	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	spin_lock(&vfsmount_lock);
+	if (!ret)
+		mnt->mnt_flags |= MNT_READONLY;
 	spin_unlock(&vfsmount_lock);
+out:
+	unlock_mnt_writers();
 	return ret;
 }
 
@@ -381,9 +410,6 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
-#ifdef CONFIG_SMP
-	free_percpu(mnt->mnt_writers);
-#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -416,11 +442,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 * lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
-struct vfsmount *lookup_mnt(struct path *path)
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct vfsmount *child_mnt;
 
 	spin_lock(&vfsmount_lock);
-	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
+	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
 		mntget(child_mnt);
 	spin_unlock(&vfsmount_lock);
 	return child_mnt;
@@ -578,18 +604,38 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
+	/*
+	 * We don't have to hold all of the locks at the
+	 * same time here because we know that we're the
+	 * last reference to mnt and that no new writers
+	 * can come in.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		if (cpu_writer->mnt != mnt) {
+			spin_unlock(&cpu_writer->lock);
+			continue;
+		}
+		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+		cpu_writer->count = 0;
+		/*
+		 * Might as well do this so that no one
+		 * ever sees the pointer and expects
+		 * it to be valid.
+		 */
+		cpu_writer->mnt = NULL;
+		spin_unlock(&cpu_writer->lock);
+	}
 	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	/*
-	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
-	 * provides barriers, so count_mnt_writers() below is safe.  AV
-	 */
-	WARN_ON(count_mnt_writers(mnt));
+	WARN_ON(atomic_read(&mnt->__mnt_writers));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
@@ -1060,8 +1106,11 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 * we just try to remount it readonly.
		 */
 		down_write(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY))
+		if (!(sb->s_flags & MS_RDONLY)) {
+			lock_kernel();
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
+			unlock_kernel();
+		}
 		up_write(&sb->s_umount);
 		return retval;
 	}
@@ -1204,11 +1253,11 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct path *path)
+struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
@@ -1381,7 +1430,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (!d_unlinked(path->dentry))
+	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1552,7 +1601,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	down_write(&namespace_sem);
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1563,7 +1612,7 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (d_unlinked(path->dentry))
+	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
@@ -1627,9 +1676,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
-	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1648,10 +1695,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
+	if (!check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
@@ -2045,8 +2092,10 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (retval < 0)
 		goto out3;
 
+	lock_kernel();
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
+	unlock_kernel();
 	free_page(data_page);
 
out3:
@@ -2126,9 +2175,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unlinked(new.dentry))
+	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
 		goto out2;
-	if (d_unlinked(old.dentry))
+	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
 		goto out2;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
diff --git a/trunk/fs/ncpfs/inode.c b/trunk/fs/ncpfs/inode.c
index b99ce205b1bd..d642f0e5b365 100644
--- a/trunk/fs/ncpfs/inode.c
+++ b/trunk/fs/ncpfs/inode.c
@@ -736,8 +736,6 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
 
-	lock_kernel();
-
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -771,8 +769,6 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
-
-	unlock_kernel();
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
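Editorial aside, not part of the patch: the namespace.c hunks above restore a split write count, a cheap per-cpu counter on the hot path, coalesced into one central atomic under per-cpu locks whenever an exact total is needed (mnt_make_readonly) or an underflow must be repaired. A simplified userspace model of that shape, with hypothetical names and a fixed CPU count, follows.

#include <pthread.h>
#include <stdatomic.h>

#define NCPU 4

struct local_count {
	pthread_mutex_t lock;
	long count;
};

static struct local_count percpu[NCPU] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 }, { PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 }, { PTHREAD_MUTEX_INITIALIZER, 0 },
};
static atomic_long total;

/* Fast path: a writer touches only its own slot (cf. mnt_want_write()). */
static void get_write(int cpu)
{
	pthread_mutex_lock(&percpu[cpu].lock);
	percpu[cpu].count++;
	pthread_mutex_unlock(&percpu[cpu].lock);
}

/* Slow path: fold every slot into the shared atomic while holding all
 * the locks, so the caller reads an exact, stable total (cf.
 * lock_mnt_writers() plus __clear_mnt_count() above). */
static long fold_and_read(void)
{
	long sum;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++) {
		pthread_mutex_lock(&percpu[cpu].lock);
		atomic_fetch_add(&total, percpu[cpu].count);
		percpu[cpu].count = 0;
	}
	sum = atomic_load(&total);	/* stable: all slots are locked */
	for (cpu = 0; cpu < NCPU; cpu++)
		pthread_mutex_unlock(&percpu[cpu].lock);
	return sum;
}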
f01caec84463..64a288ee046d 100644 --- a/trunk/fs/nfs/namespace.c +++ b/trunk/fs/nfs/namespace.c @@ -154,7 +154,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) goto out; out_follow: while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) + follow_down(&nd->path.mnt, &nd->path.dentry)) ; err = 0; goto out; diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c index 26127b69a275..d2d67781c579 100644 --- a/trunk/fs/nfs/super.c +++ b/trunk/fs/nfs/super.c @@ -1813,7 +1813,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (data == NULL) return -ENOMEM; - lock_kernel(); /* fill out struct with values from existing mount */ data->flags = nfss->flags; data->rsize = nfss->rsize; @@ -1838,7 +1837,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) error = nfs_compare_remount_data(nfss, data); out: kfree(data); - unlock_kernel(); return error; } diff --git a/trunk/fs/nfsd/export.c b/trunk/fs/nfsd/export.c index 8b1f8efb4690..5839b229cd0e 100644 --- a/trunk/fs/nfsd/export.c +++ b/trunk/fs/nfsd/export.c @@ -847,8 +847,9 @@ exp_get_fsid_key(svc_client *clp, int fsid) return exp_find_key(clp, FSID_NUM, fsidv, NULL); } -static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, - struct cache_req *reqp) +static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, + struct dentry *dentry, + struct cache_req *reqp) { struct svc_export *exp, key; int err; @@ -857,7 +858,8 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, return ERR_PTR(-ENOENT); key.ex_client = clp; - key.ex_path = *path; + key.ex_path.mnt = mnt; + key.ex_path.dentry = dentry; exp = svc_export_lookup(&key); if (exp == NULL) @@ -871,19 +873,24 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, /* * Find the export entry for a given dentry. 
*/ -static struct svc_export *exp_parent(svc_client *clp, struct path *path) +static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, + struct dentry *dentry, + struct cache_req *reqp) { - struct dentry *saved = dget(path->dentry); - svc_export *exp = exp_get_by_name(clp, path, NULL); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { - struct dentry *parent = dget_parent(path->dentry); - dput(path->dentry); - path->dentry = parent; - exp = exp_get_by_name(clp, path, NULL); + svc_export *exp; + + dget(dentry); + exp = exp_get_by_name(clp, mnt, dentry, reqp); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { + struct dentry *parent; + + parent = dget_parent(dentry); + dput(dentry); + dentry = parent; + exp = exp_get_by_name(clp, mnt, dentry, reqp); } - dput(path->dentry); - path->dentry = saved; + dput(dentry); return exp; } @@ -1011,7 +1018,7 @@ exp_export(struct nfsctl_export *nxp) goto out_put_clp; err = -EINVAL; - exp = exp_get_by_name(clp, &path, NULL); + exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); memset(&new, 0, sizeof(new)); @@ -1128,7 +1135,7 @@ exp_unexport(struct nfsctl_export *nxp) goto out_domain; err = -EINVAL; - exp = exp_get_by_name(dom, &path, NULL); + exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); path_put(&path); if (IS_ERR(exp)) goto out_domain; @@ -1170,7 +1177,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); - exp = exp_parent(clp, &path); + exp = exp_parent(clp, path.mnt, path.dentry, NULL); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; @@ -1200,7 +1207,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type, if (IS_ERR(ek)) return ERR_CAST(ek); - exp = exp_get_by_name(clp, &ek->ek_path, reqp); + exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); cache_put(&ek->h, &svc_expkey_cache); if (IS_ERR(exp)) @@ -1240,7 +1247,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) * use exp_get_by_name() or exp_find(). 
*/ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, + struct dentry *dentry) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); @@ -1248,7 +1256,8 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) goto gss; /* First try the auth_unix client: */ - exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); + exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, + &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) @@ -1260,7 +1269,8 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; - gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); + gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, + &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) @@ -1299,19 +1309,23 @@ rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) } struct svc_export * -rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) +rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, + struct dentry *dentry) { - struct dentry *saved = dget(path->dentry); - struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { - struct dentry *parent = dget_parent(path->dentry); - dput(path->dentry); - path->dentry = parent; - exp = rqst_exp_get_by_name(rqstp, path); + struct svc_export *exp; + + dget(dentry); + exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { + struct dentry *parent; + + parent = dget_parent(dentry); + dput(dentry); + dentry = parent; + exp = rqst_exp_get_by_name(rqstp, mnt, dentry); } - dput(path->dentry); - path->dentry = saved; + dput(dentry); return exp; } diff --git a/trunk/fs/nfsd/vfs.c b/trunk/fs/nfsd/vfs.c index 99f835753596..bd584bcf1d9f 100644 --- a/trunk/fs/nfsd/vfs.c +++ b/trunk/fs/nfsd/vfs.c @@ -101,35 +101,36 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, { struct svc_export *exp = *expp, *exp2 = NULL; struct dentry *dentry = *dpp; - struct path path = {.mnt = mntget(exp->ex_path.mnt), - .dentry = dget(dentry)}; + struct vfsmount *mnt = mntget(exp->ex_path.mnt); + struct dentry *mounts = dget(dentry); int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); - exp2 = rqst_exp_get_by_name(rqstp, &path); + exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); if (IS_ERR(exp2)) { if (PTR_ERR(exp2) != -ENOENT) err = PTR_ERR(exp2); - path_put(&path); + dput(mounts); + mntput(mnt); goto out; } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* - * This is subtle: path.dentry is *not* on path.mnt - * at this point. The only reason we are safe is that - * original mnt is pinned down by exp, so we should - * put path *before* putting exp + * This is subtle: dentry is *not* under mnt at this point. + * The only reason we are safe is that original mnt is pinned + * down by exp, so we should dput before putting exp. 
*/ - *dpp = path.dentry; - path.dentry = dentry; + dput(dentry); + *dpp = mounts; + exp_put(exp); *expp = exp2; - exp2 = exp; + } else { + exp_put(exp2); + dput(mounts); } - path_put(&path); - exp_put(exp2); + mntput(mnt); out: return err; } @@ -168,29 +169,28 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, /* checking mountpoint crossing is very different when stepping up */ struct svc_export *exp2 = NULL; struct dentry *dp; - struct path path = {.mnt = mntget(exp->ex_path.mnt), - .dentry = dget(dparent)}; - - while (path.dentry == path.mnt->mnt_root && - follow_up(&path)) + struct vfsmount *mnt = mntget(exp->ex_path.mnt); + dentry = dget(dparent); + while (dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) ; - dp = dget_parent(path.dentry); - dput(path.dentry); - path.dentry = dp; + dp = dget_parent(dentry); + dput(dentry); + dentry = dp; - exp2 = rqst_exp_parent(rqstp, &path); + exp2 = rqst_exp_parent(rqstp, mnt, dentry); if (PTR_ERR(exp2) == -ENOENT) { + dput(dentry); dentry = dget(dparent); } else if (IS_ERR(exp2)) { host_err = PTR_ERR(exp2); - path_put(&path); + dput(dentry); + mntput(mnt); goto out_nfserr; } else { - dentry = dget(path.dentry); exp_put(exp); exp = exp2; } - path_put(&path); + mntput(mnt); } } else { fh_lock(fhp); diff --git a/trunk/fs/nilfs2/cpfile.c b/trunk/fs/nilfs2/cpfile.c index cadd36b14d07..300f1cdfa862 100644 --- a/trunk/fs/nilfs2/cpfile.c +++ b/trunk/fs/nilfs2/cpfile.c @@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) case NILFS_CHECKPOINT: /* * Check for protecting existing snapshot mounts: - * ns_mount_mutex is used to make this operation atomic and + * bd_mount_sem is used to make this operation atomic and * exclusive with a new mount job. Though it doesn't cover * umount, it's enough for the purpose.
*/ - mutex_lock(&nilfs->ns_mount_mutex); + down(&nilfs->ns_bdev->bd_mount_sem); if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { /* Current implementation does not have to protect plain read-only mounts since they are exclusive @@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) ret = -EBUSY; } else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); - mutex_unlock(&nilfs->ns_mount_mutex); + up(&nilfs->ns_bdev->bd_mount_sem); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); diff --git a/trunk/fs/nilfs2/sb.h b/trunk/fs/nilfs2/sb.h index 0776ccc2504a..adccd4fc654e 100644 --- a/trunk/fs/nilfs2/sb.h +++ b/trunk/fs/nilfs2/sb.h @@ -60,7 +60,6 @@ struct nilfs_sb_info { struct super_block *s_super; /* reverse pointer to super_block */ struct the_nilfs *s_nilfs; struct list_head s_list; /* list head for nilfs->ns_supers */ - atomic_t s_count; /* reference count */ /* Segment constructor */ struct list_head s_dirty_files; /* dirty files list */ diff --git a/trunk/fs/nilfs2/super.c b/trunk/fs/nilfs2/super.c index 1777a3467bd2..6989b03e97ab 100644 --- a/trunk/fs/nilfs2/super.c +++ b/trunk/fs/nilfs2/super.c @@ -65,8 +65,9 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); -static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags); /** * nilfs_error() - report failure condition on a filesystem @@ -314,11 +315,6 @@ static void nilfs_put_super(struct super_block *sb) struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; - lock_kernel(); - - if (sb->s_dirt) - nilfs_write_super(sb); - nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { @@ -327,18 +323,12 @@ static void nilfs_put_super(struct super_block *sb) nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); } - down_write(&nilfs->ns_super_sem); - if (nilfs->ns_current == sbi) - nilfs->ns_current = NULL; - up_write(&nilfs->ns_super_sem); nilfs_detach_checkpoint(sbi); put_nilfs(sbi->s_nilfs); sbi->s_super = NULL; sb->s_fs_info = NULL; - nilfs_put_sbinfo(sbi); - - unlock_kernel(); + kfree(sbi); } /** @@ -393,8 +383,6 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) { int err = 0; - nilfs_write_super(sb); - /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); @@ -408,9 +396,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) struct buffer_head *bh_cp; int err; - down_write(&nilfs->ns_super_sem); + down_write(&nilfs->ns_sem); list_add(&sbi->s_list, &nilfs->ns_supers); - up_write(&nilfs->ns_super_sem); + up_write(&nilfs->ns_sem); sbi->s_ifile = nilfs_mdt_new( nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); @@ -448,9 +436,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; - down_write(&nilfs->ns_super_sem); + down_write(&nilfs->ns_sem); list_del_init(&sbi->s_list); - up_write(&nilfs->ns_super_sem); + up_write(&nilfs->ns_sem); return err; } @@ -462,9 +450,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) nilfs_mdt_clear(sbi->s_ifile); nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; - down_write(&nilfs->ns_super_sem); + down_write(&nilfs->ns_sem); list_del_init(&sbi->s_list); - up_write(&nilfs->ns_super_sem); + 
up_write(&nilfs->ns_sem); } static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) @@ -764,7 +752,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, * @silent: silent mode flag * @nilfs: the_nilfs struct * - * This function is called exclusively by nilfs->ns_mount_mutex. + * This function is called exclusively under bd_mount_sem. * So, the recovery process is protected from other simultaneous mounts. */ static int @@ -785,7 +773,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, get_nilfs(nilfs); sbi->s_nilfs = nilfs; sbi->s_super = sb; - atomic_set(&sbi->s_count, 1); err = init_nilfs(nilfs, sbi, (char *)data); if (err) @@ -883,11 +870,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, goto failed_root; } - down_write(&nilfs->ns_super_sem); - if (!nilfs_test_opt(sbi, SNAPSHOT)) - nilfs->ns_current = sbi; - up_write(&nilfs->ns_super_sem); - return 0; failed_root: @@ -903,7 +885,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, failed_sbi: put_nilfs(nilfs); sb->s_fs_info = NULL; - nilfs_put_sbinfo(sbi); + kfree(sbi); return err; } @@ -916,9 +898,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct nilfs_mount_options old_opts; int err; - lock_kernel(); - - down_write(&nilfs->ns_super_sem); old_sb_flags = sb->s_flags; old_opts.mount_opt = sbi->s_mount_opt; old_opts.snapshot_cno = sbi->s_snapshot_cno; @@ -966,12 +945,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ - if (nilfs->ns_current && nilfs->ns_current != sbi) { + down(&sb->s_bdev->bd_mount_sem); + /* Check for an existing RW-mount */ + if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because an RW-mount exists.\n", + "remount because an RW-mount exists.\n", sb->s_id); err = -EBUSY; - goto restore_opts; + goto rw_remount_failed; } if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " @@ -979,7 +960,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) "the latest one.\n", sb->s_id); err = -EINVAL; - goto restore_opts; + goto rw_remount_failed; } sb->s_flags &= ~MS_RDONLY; nilfs_clear_opt(sbi, SNAPSHOT); @@ -987,31 +968,28 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = nilfs_attach_segment_constructor(sbi); if (err) - goto restore_opts; + goto rw_remount_failed; down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); up_write(&nilfs->ns_sem); - nilfs->ns_current = sbi; + up(&sb->s_bdev->bd_mount_sem); } out: - up_write(&nilfs->ns_super_sem); - unlock_kernel(); return 0; + rw_remount_failed: + up(&sb->s_bdev->bd_mount_sem); restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.mount_opt; sbi->s_snapshot_cno = old_opts.snapshot_cno; - up_write(&nilfs->ns_super_sem); - unlock_kernel(); return err; } struct nilfs_super_data { struct block_device *bdev; - struct nilfs_sb_info *sbi; __u64 cno; int flags; }; @@ -1070,7 +1048,33 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data) { struct nilfs_super_data *sd = data; - return sd->sbi && s->s_fs_info == (void *)sd->sbi; + return s->s_bdev == sd->bdev; +} + +static int nilfs_test_bdev_super2(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + + if (!((s->s_flags |
sd->flags) & MS_RDONLY) + return 1; /* Reuse an old R/W-mode super_block */ + + if (s->s_flags & sd->flags & MS_RDONLY) { + if (down_read_trylock(&s->s_umount)) { + ret = s->s_root && + (sd->cno == NILFS_SB(s)->s_snapshot_cno); + up_read(&s->s_umount); + /* + * This path is locked with sb_lock by sget(). + * So, drop_super() causes a deadlock. + */ + return ret; + } + } + return 0; } static int @@ -1078,8 +1082,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { struct nilfs_super_data sd; - struct super_block *s; - struct the_nilfs *nilfs; + struct super_block *s, *s2; + struct the_nilfs *nilfs = NULL; int err, need_to_close = 1; sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); @@ -1091,6 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, * much more information than normal filesystems to identify mount * instance. For snapshot mounts, not only a mount type (ro-mount * or rw-mount) but also a checkpoint number is required. + * The results are passed to sget() using nilfs_super_data. */ sd.cno = 0; sd.flags = flags; @@ -1099,59 +1104,64 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, goto failed; } - nilfs = find_or_create_nilfs(sd.bdev); - if (!nilfs) { - err = -ENOMEM; - goto failed; - } - - mutex_lock(&nilfs->ns_mount_mutex); - - if (!sd.cno) { - /* - * Check if an exclusive mount exists or not. - * Snapshot mounts coexist with a current mount - * (i.e. rw-mount or ro-mount), whereas rw-mount and - * ro-mount are mutually exclusive. - */ - down_read(&nilfs->ns_super_sem); - if (nilfs->ns_current && - ((nilfs->ns_current->s_super->s_flags ^ flags) - & MS_RDONLY)) { - up_read(&nilfs->ns_super_sem); - err = -EBUSY; - goto failed_unlock; - } - up_read(&nilfs->ns_super_sem); - } - /* - * Find existing nilfs_sb_info struct + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting */ - sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); - - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); + down(&sd.bdev->bd_mount_sem); + if (!sd.cno && + (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { + err = (err < 0) ? : -EBUSY; + goto failed_unlock; + } /* - * Get super block instance holding the nilfs_sb_info struct. - * A new instance is allocated if no existing mount is present or - * existing instance has been unmounted. + * Phase-1: search for any existent instance and get the_nilfs */ s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); - if (sd.sbi) - nilfs_put_sbinfo(sd.sbi); + if (IS_ERR(s)) + goto error_s; - if (IS_ERR(s)) { - err = PTR_ERR(s); - goto failed_unlock; + if (!s->s_root) { + err = -ENOMEM; + nilfs = alloc_nilfs(sd.bdev); + if (!nilfs) + goto cancel_new; + } else { + struct nilfs_sb_info *sbi = NILFS_SB(s); + + /* + * s_umount protects the super_block from the unmount process; + * it covers the pointers to nilfs_sb_info and the_nilfs. + */ + nilfs = sbi->s_nilfs; + get_nilfs(nilfs); + up_write(&s->s_umount); + + /* + * Phase-2: search for the specified snapshot or R/W-mode super_block + */ + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); + + s2 = sget(fs_type, nilfs_test_bdev_super2, + nilfs_set_bdev_super, &sd); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive() only via + * kill_block_super();
here, s is an existent mount, so we need + * one more close_bdev_exclusive() call. + */ + s = s2; + if (IS_ERR(s)) + goto error_s; } if (!s->s_root) { char b[BDEVNAME_SIZE]; - /* New superblock instance created */ s->s_flags = flags; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); @@ -1162,18 +1172,26 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; need_to_close = 0; + } else if (!(s->s_flags & MS_RDONLY)) { + err = -EBUSY; } - mutex_unlock(&nilfs->ns_mount_mutex); + up(&sd.bdev->bd_mount_sem); put_nilfs(nilfs); if (need_to_close) close_bdev_exclusive(sd.bdev, flags); simple_set_mnt(mnt, s); return 0; + error_s: + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + close_bdev_exclusive(sd.bdev, flags); + return PTR_ERR(s); + failed_unlock: - mutex_unlock(&nilfs->ns_mount_mutex); - put_nilfs(nilfs); + up(&sd.bdev->bd_mount_sem); failed: close_bdev_exclusive(sd.bdev, flags); @@ -1181,18 +1199,70 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, cancel_new: /* Abandoning the newly allocated superblock */ - mutex_unlock(&nilfs->ns_mount_mutex); - put_nilfs(nilfs); + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); up_write(&s->s_umount); deactivate_super(s); /* * deactivate_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; - * put_nilfs() needs the block device. + * put_nilfs() and unlocking bd_mount_sem need the block device. */ return err; } +static int nilfs_test_bdev_super3(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + if (down_read_trylock(&s->s_umount)) { + ret = (s->s_flags & MS_RDONLY) && s->s_root && + nilfs_test_opt(NILFS_SB(s), SNAPSHOT); + up_read(&s->s_umount); + if (ret) + return 0; /* ignore snapshot mounts */ + } + return !((sd->flags ^ s->s_flags) & MS_RDONLY); +} + +static int __false_bdev_super(struct super_block *s, void *data) +{ +#if 0 /* XXX: workaround for lock debug. This is not a good idea */ + up_write(&s->s_umount); +#endif + return -EFAULT; +} + +/** + * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. + * @fs_type: filesystem type + * @bdev: block device + * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount) + * + * This function must be called within a section protected by bd_mount_sem.
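+ *
+ * An illustrative call pattern follows (an editor's sketch, not code from
+ * this patch; compare the nilfs_get_sb() and nilfs_remount() callers above):
+ *
+ *	down(&bdev->bd_mount_sem);
+ *	err = test_exclusive_mount(fs_type, bdev, flags ^ MS_RDONLY);
+ *	if (err > 0)
+ *		err = -EBUSY;	/* a conflicting mount exists */
+ *	else if (err < 0)
+ *		goto failed;	/* a sget() failure is propagated */
+ *	...
+ *	up(&bdev->bd_mount_sem);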
+ */ +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags) +{ + struct super_block *s; + struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; + + s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); + if (IS_ERR(s)) { + if (PTR_ERR(s) != -EFAULT) + return PTR_ERR(s); + return 0; /* Not found */ + } + up_write(&s->s_umount); + deactivate_super(s); + return 1; /* Found */ +} + struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", diff --git a/trunk/fs/nilfs2/the_nilfs.c b/trunk/fs/nilfs2/the_nilfs.c index e4e5c78bcc93..a91f15b8673c 100644 --- a/trunk/fs/nilfs2/the_nilfs.c +++ b/trunk/fs/nilfs2/the_nilfs.c @@ -35,10 +35,6 @@ #include "seglist.h" #include "segbuf.h" - -static LIST_HEAD(nilfs_objects); -static DEFINE_SPINLOCK(nilfs_lock); - void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { @@ -59,7 +55,7 @@ void nilfs_set_last_segment * Return Value: On success, pointer to the_nilfs is returned. * On error, NULL is returned. */ -static struct the_nilfs *alloc_nilfs(struct block_device *bdev) +struct the_nilfs *alloc_nilfs(struct block_device *bdev) { struct the_nilfs *nilfs; @@ -72,10 +68,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); - init_rwsem(&nilfs->ns_super_sem); - mutex_init(&nilfs->ns_mount_mutex); mutex_init(&nilfs->ns_writer_mutex); - INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); nilfs->ns_gc_inodes_h = NULL; @@ -84,45 +77,6 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) return nilfs; } -/** - * find_or_create_nilfs - find or create nilfs object - * @bdev: block device to which the_nilfs is related - * - * find_nilfs() looks up an existent nilfs object created on the - * device and gets the reference count of the object. If no nilfs object - * is found on the device, a new nilfs object is allocated. - * - * Return Value: On success, pointer to the nilfs object is returned. - * On error, NULL is returned. - */ -struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) -{ - struct the_nilfs *nilfs, *new = NULL; - - retry: - spin_lock(&nilfs_lock); - list_for_each_entry(nilfs, &nilfs_objects, ns_list) { - if (nilfs->ns_bdev == bdev) { - get_nilfs(nilfs); - spin_unlock(&nilfs_lock); - if (new) - put_nilfs(new); - return nilfs; /* existing object */ - } - } - if (new) { - list_add_tail(&new->ns_list, &nilfs_objects); - spin_unlock(&nilfs_lock); - return new; /* new object */ - } - spin_unlock(&nilfs_lock); - - new = alloc_nilfs(bdev); - if (new) - goto retry; - return NULL; /* insufficient memory */ -} - /** * put_nilfs - release a reference to the_nilfs * @nilfs: the_nilfs structure to be released @@ -132,20 +86,13 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) */ void put_nilfs(struct the_nilfs *nilfs) { - spin_lock(&nilfs_lock); - if (!atomic_dec_and_test(&nilfs->ns_count)) { - spin_unlock(&nilfs_lock); + if (!atomic_dec_and_test(&nilfs->ns_count)) return; - } - list_del_init(&nilfs->ns_list); - spin_unlock(&nilfs_lock); - /* - * Increment of ns_count never occurs below because the caller + * Increment of ns_count never occurs below because the caller * of get_nilfs() holds at least one reference to the_nilfs. * Thus its exclusion control is not required here.
*/ - might_sleep(); if (nilfs_loaded(nilfs)) { nilfs_mdt_clear(nilfs->ns_sufile); @@ -666,63 +613,13 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs) return ret; } -/** - * nilfs_find_sbinfo - find existing nilfs_sb_info structure - * @nilfs: nilfs object - * @rw_mount: mount type (non-zero value for read/write mount) - * @cno: checkpoint number (zero for read-only mount) - * - * nilfs_find_sbinfo() returns the nilfs_sb_info structure which - * @rw_mount and @cno (in case of snapshots) matched. If no instance - * was found, NULL is returned. Although the super block instance can - * be unmounted after this function returns, the nilfs_sb_info struct - * is kept on memory until nilfs_put_sbinfo() is called. - */ -struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs, - int rw_mount, __u64 cno) -{ - struct nilfs_sb_info *sbi; - - down_read(&nilfs->ns_super_sem); - /* - * The SNAPSHOT flag and sb->s_flags are supposed to be - * protected with nilfs->ns_super_sem. - */ - sbi = nilfs->ns_current; - if (rw_mount) { - if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) - goto found; /* read/write mount */ - else - goto out; - } else if (cno == 0) { - if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) - goto found; /* read-only mount */ - else - goto out; - } - - list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { - if (nilfs_test_opt(sbi, SNAPSHOT) && - sbi->s_snapshot_cno == cno) - goto found; /* snapshot mount */ - } - out: - up_read(&nilfs->ns_super_sem); - return NULL; - - found: - atomic_inc(&sbi->s_count); - up_read(&nilfs->ns_super_sem); - return sbi; -} - int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, int snapshot_mount) { struct nilfs_sb_info *sbi; int ret = 0; - down_read(&nilfs->ns_super_sem); + down_read(&nilfs->ns_sem); if (cno == 0 || cno > nilfs->ns_cno) goto out_unlock; @@ -739,6 +636,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, ret++; out_unlock: - up_read(&nilfs->ns_super_sem); + up_read(&nilfs->ns_sem); return ret; } diff --git a/trunk/fs/nilfs2/the_nilfs.h b/trunk/fs/nilfs2/the_nilfs.h index e8adbffc626f..30fe58778d05 100644 --- a/trunk/fs/nilfs2/the_nilfs.h +++ b/trunk/fs/nilfs2/the_nilfs.h @@ -43,16 +43,12 @@ enum { * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags * @ns_count: reference count - * @ns_list: list head for nilfs_list * @ns_bdev: block device * @ns_bdi: backing dev info * @ns_writer: back pointer to writable nilfs_sb_info * @ns_sem: semaphore for shared states - * @ns_super_sem: semaphore for global operations across super block instances - * @ns_mount_mutex: mutex protecting mount process of nilfs * @ns_writer_mutex: mutex protecting ns_writer attach/detach * @ns_writer_refcount: number of referrers on ns_writer - * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data * @ns_sbwtime: previous write time of super blocks @@ -92,23 +88,14 @@ enum { struct the_nilfs { unsigned long ns_flags; atomic_t ns_count; - struct list_head ns_list; struct block_device *ns_bdev; struct backing_dev_info *ns_bdi; struct nilfs_sb_info *ns_writer; struct rw_semaphore ns_sem; - struct rw_semaphore ns_super_sem; - struct mutex ns_mount_mutex; struct mutex ns_writer_mutex; atomic_t ns_writer_refcount; - /* - * components protected by ns_super_sem - */ - struct nilfs_sb_info *ns_current; - struct list_head ns_supers; - /* * used for * - loading the latest checkpoint exclusively. 
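To make the restored reference counting concrete, the expected lifetime of a the_nilfs object after this change can be sketched as follows (an illustrative fragment, not part of the patch; it assumes alloc_nilfs() initializes ns_count to 1, which the atomic_dec_and_test() in put_nilfs() implies, and it omits error handling):

	struct the_nilfs *nilfs = alloc_nilfs(bdev);	/* ns_count == 1 */
	if (!nilfs)
		return -ENOMEM;
	get_nilfs(nilfs);	/* second user, e.g. a snapshot mount reusing it */
	...
	put_nilfs(nilfs);	/* ns_count 2 -> 1: the object survives */
	put_nilfs(nilfs);	/* ns_count 1 -> 0: the_nilfs is freed */
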
@@ -121,6 +108,7 @@ struct the_nilfs { time_t ns_sbwtime[2]; unsigned ns_sbsize; unsigned ns_mount_state; + struct list_head ns_supers; /* * Following fields are dedicated to a writable FS-instance. @@ -203,12 +191,11 @@ THE_NILFS_FNS(DISCONTINUED, discontinued) #define NILFS_ALTSB_FREQ 60 /* spare superblock */ void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); -struct the_nilfs *find_or_create_nilfs(struct block_device *); +struct the_nilfs *alloc_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); -struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); int nilfs_near_disk_full(struct the_nilfs *); void nilfs_fall_back_super_block(struct the_nilfs *); @@ -251,12 +238,6 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) mutex_unlock(&nilfs->ns_writer_mutex); } -static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) -{ - if (!atomic_dec_and_test(&sbi->s_count)) - kfree(sbi); -} - static inline void nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, sector_t *seg_start, sector_t *seg_end) diff --git a/trunk/fs/notify/Kconfig b/trunk/fs/notify/Kconfig index 31dac7e3b0f1..50914d7303c6 100644 --- a/trunk/fs/notify/Kconfig +++ b/trunk/fs/notify/Kconfig @@ -1,15 +1,2 @@ -config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. 
- source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/trunk/fs/notify/Makefile b/trunk/fs/notify/Makefile index 0922cc826c46..5a95b6010ce7 100644 --- a/trunk/fs/notify/Makefile +++ b/trunk/fs/notify/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o - obj-y += dnotify/ obj-y += inotify/ diff --git a/trunk/fs/notify/dnotify/Kconfig b/trunk/fs/notify/dnotify/Kconfig index 904ff8d5405a..26adf5dfa646 100644 --- a/trunk/fs/notify/dnotify/Kconfig +++ b/trunk/fs/notify/dnotify/Kconfig @@ -1,6 +1,5 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/trunk/fs/notify/dnotify/dnotify.c b/trunk/fs/notify/dnotify/dnotify.c index 828a889be909..b0aa2cde80bd 100644 --- a/trunk/fs/notify/dnotify/dnotify.c +++ b/trunk/fs/notify/dnotify/dnotify.c @@ -3,9 +3,6 @@ * * Copyright (C) 2000,2001,2002 Stephen Rothwell * - * Copyright (C) 2009 Eric Paris - * dnotify was largly rewritten to use the new fsnotify infrastructure - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -24,173 +21,24 @@ #include #include #include -#include int dir_notify_enable __read_mostly = 1; -static struct kmem_cache *dnotify_struct_cache __read_mostly; -static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; -static struct fsnotify_group *dnotify_group __read_mostly; -static DEFINE_MUTEX(dnotify_mark_mutex); - -/* - * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which - * is being watched by dnotify. If multiple userspace applications are watching - * the same directory with dnotify their information is chained in dn - */ -struct dnotify_mark_entry { - struct fsnotify_mark_entry fsn_entry; - struct dnotify_struct *dn; -}; +static struct kmem_cache *dn_cache __read_mostly; -/* - * When a process starts or stops watching an inode the set of events which - * dnotify cares about for that inode may change. This function runs the - * list of everything receiving dnotify events about this directory and calculates - * the set of all those events. After it updates what dnotify is interested in - * it calls the fsnotify function so it can update the set of all events relevant - * to this inode. - */ -static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) +static void redo_inode_mask(struct inode *inode) { - __u32 new_mask, old_mask; + unsigned long new_mask; struct dnotify_struct *dn; - struct dnotify_mark_entry *dnentry = container_of(entry, - struct dnotify_mark_entry, - fsn_entry); - - assert_spin_locked(&entry->lock); - old_mask = entry->mask; new_mask = 0; - for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) - new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); - entry->mask = new_mask; - - if (old_mask == new_mask) - return; - - if (entry->inode) - fsnotify_recalc_inode_mask(entry->inode); -} - -/* - * Mains fsnotify call where events are delivered to dnotify. - * Find the dnotify mark on the relevant inode, run the list of dnotify structs - * on that mark and determine which of them has expressed interest in receiving - * events of this type. When found send the correct process and signal and - * destroy the dnotify struct if it was not registered to receive multiple - * events. 
- */ -static int dnotify_handle_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - struct fsnotify_mark_entry *entry = NULL; - struct dnotify_mark_entry *dnentry; - struct inode *to_tell; - struct dnotify_struct *dn; - struct dnotify_struct **prev; - struct fown_struct *fown; - - to_tell = event->to_tell; - - spin_lock(&to_tell->i_lock); - entry = fsnotify_find_mark_entry(group, to_tell); - spin_unlock(&to_tell->i_lock); - - /* unlikely since we alreay passed dnotify_should_send_event() */ - if (unlikely(!entry)) - return 0; - dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); - - spin_lock(&entry->lock); - prev = &dnentry->dn; - while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event->mask) == 0) { - prev = &dn->dn_next; - continue; - } - fown = &dn->dn_filp->f_owner; - send_sigio(fown, dn->dn_fd, POLL_MSG); - if (dn->dn_mask & FS_DN_MULTISHOT) - prev = &dn->dn_next; - else { - *prev = dn->dn_next; - kmem_cache_free(dnotify_struct_cache, dn); - dnotify_recalc_inode_mask(entry); - } - } - - spin_unlock(&entry->lock); - fsnotify_put_mark(entry); - - return 0; -} - -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, __u32 mask) -{ - struct fsnotify_mark_entry *entry; - bool send; - - /* !dir_notify_enable should never get here, don't waste time checking - if (!dir_notify_enable) - return 0; */ - - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(group, inode); - spin_unlock(&inode->i_lock); - - /* no mark means no dnotify watch */ - if (!entry) - return false; - - mask = (mask & ~FS_EVENT_ON_CHILD); - send = (mask & entry->mask); - - fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ - - return send; -} - -static void dnotify_free_mark(struct fsnotify_mark_entry *entry) -{ - struct dnotify_mark_entry *dnentry = container_of(entry, - struct dnotify_mark_entry, - fsn_entry); - - BUG_ON(dnentry->dn); - - kmem_cache_free(dnotify_mark_entry_cache, dnentry); + for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) + new_mask |= dn->dn_mask & ~DN_MULTISHOT; + inode->i_dnotify_mask = new_mask; } -static struct fsnotify_ops dnotify_fsnotify_ops = { - .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, -}; - -/* - * Called every time a file is closed. Looks first for a dnotify mark on the - * inode. If one is found run all of the ->dn entries attached to that - * mark for one relevant to this process closing the file and remove that - * dnotify_struct. If that was the last dnotify_struct also remove the - * fsnotify_mark_entry. 
- */ void dnotify_flush(struct file *filp, fl_owner_t id) { - struct fsnotify_mark_entry *entry; - struct dnotify_mark_entry *dnentry; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; @@ -198,243 +46,145 @@ void dnotify_flush(struct file *filp, fl_owner_t id) inode = filp->f_path.dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return; - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(dnotify_group, inode); - spin_unlock(&inode->i_lock); - if (!entry) - return; - dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); - - mutex_lock(&dnotify_mark_mutex); - - spin_lock(&entry->lock); - prev = &dnentry->dn; + prev = &inode->i_dnotify; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; - kmem_cache_free(dnotify_struct_cache, dn); - dnotify_recalc_inode_mask(entry); + redo_inode_mask(inode); + kmem_cache_free(dn_cache, dn); break; } prev = &dn->dn_next; } - - spin_unlock(&entry->lock); - - /* nothing else could have found us thanks to the dnotify_mark_mutex */ - if (dnentry->dn == NULL) - fsnotify_destroy_mark_by_entry(entry); - - fsnotify_recalc_group_mask(dnotify_group); - - mutex_unlock(&dnotify_mark_mutex); - - fsnotify_put_mark(entry); -} - -/* this conversion is done only at watch creation */ -static __u32 convert_arg(unsigned long arg) -{ - __u32 new_mask = FS_EVENT_ON_CHILD; - - if (arg & DN_MULTISHOT) - new_mask |= FS_DN_MULTISHOT; - if (arg & DN_DELETE) - new_mask |= (FS_DELETE | FS_MOVED_FROM); - if (arg & DN_MODIFY) - new_mask |= FS_MODIFY; - if (arg & DN_ACCESS) - new_mask |= FS_ACCESS; - if (arg & DN_ATTRIB) - new_mask |= FS_ATTRIB; - if (arg & DN_RENAME) - new_mask |= FS_DN_RENAME; - if (arg & DN_CREATE) - new_mask |= (FS_CREATE | FS_MOVED_TO); - - return new_mask; -} - -/* - * If multiple processes watch the same inode with dnotify there is only one - * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct - * onto that mark. This function either attaches the new dnotify_struct onto - * that list, or it |= the mask onto an existing dnofiy_struct. - */ -static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, - fl_owner_t id, int fd, struct file *filp, __u32 mask) -{ - struct dnotify_struct *odn; - - odn = dnentry->dn; - while (odn != NULL) { - /* adding more events to existing dnofiy_struct? */ - if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { - odn->dn_fd = fd; - odn->dn_mask |= mask; - return -EEXIST; - } - odn = odn->dn_next; - } - - dn->dn_mask = mask; - dn->dn_fd = fd; - dn->dn_filp = filp; - dn->dn_owner = id; - dn->dn_next = dnentry->dn; - dnentry->dn = dn; - - return 0; + spin_unlock(&inode->i_lock); } -/* - * When a process calls fcntl to attach a dnotify watch to a directory it ends - * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be - * attached to the fsnotify_mark. 
- */ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) { - struct dnotify_mark_entry *new_dnentry, *dnentry; - struct fsnotify_mark_entry *new_entry, *entry; struct dnotify_struct *dn; + struct dnotify_struct *odn; + struct dnotify_struct **prev; struct inode *inode; fl_owner_t id = current->files; struct file *f; - int destroy = 0, error = 0; - __u32 mask; - - /* we use these to tell if we need to kfree */ - new_entry = NULL; - dn = NULL; - - if (!dir_notify_enable) { - error = -EINVAL; - goto out_err; - } + int error = 0; - /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); - error = 0; - goto out_err; + return 0; } - - /* dnotify only works on directories */ + if (!dir_notify_enable) + return -EINVAL; inode = filp->f_path.dentry->d_inode; - if (!S_ISDIR(inode->i_mode)) { - error = -ENOTDIR; - goto out_err; - } - - /* expect most fcntl to add new rather than augment old */ - dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); - if (!dn) { - error = -ENOMEM; - goto out_err; - } - - /* new fsnotify mark, we expect most fcntl calls to add a new mark */ - new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); - if (!new_dnentry) { - error = -ENOMEM; - goto out_err; - } - - /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ - mask = convert_arg(arg); - - /* set up the new_entry and new_dnentry */ - new_entry = &new_dnentry->fsn_entry; - fsnotify_init_mark(new_entry, dnotify_free_mark); - new_entry->mask = mask; - new_dnentry->dn = NULL; - - /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_mark_mutex); - - /* add the new_entry or find an old one. */ + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); + if (dn == NULL) + return -ENOMEM; spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(dnotify_group, inode); - spin_unlock(&inode->i_lock); - if (entry) { - dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); - spin_lock(&entry->lock); - } else { - fsnotify_add_mark(new_entry, dnotify_group, inode); - spin_lock(&new_entry->lock); - entry = new_entry; - dnentry = new_dnentry; - /* we used new_entry, so don't free it */ - new_entry = NULL; + prev = &inode->i_dnotify; + while ((odn = *prev) != NULL) { + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= arg; + inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; + goto out_free; + } + prev = &odn->dn_next; } rcu_read_lock(); f = fcheck(fd); rcu_read_unlock(); - - /* if (f != filp) means that we lost a race and another task/thread - * actually closed the fd we are still playing with before we grabbed - * the dnotify_mark_mutex and entry->lock. Since closing the fd is the - * only time we clean up the mark entries we need to get our mark off - * the list. */ - if (f != filp) { - /* if we added ourselves, shoot ourselves, it's possible that - * the flush actually did shoot this entry. That's fine too - * since multiple calls to destroy_mark is perfectly safe, if - * we found a dnentry already attached to the inode, just sod - * off silently as the flush at close time dealt with it. 
- */ - if (dnentry == new_dnentry) - destroy = 1; - goto out; - } + /* we'd lost the race with close(), sod off silently */ + /* note that inode->i_lock prevents reordering problems + * between accesses to descriptor table and ->i_dnotify */ + if (f != filp) + goto out_free; error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) { - /* if we added, we must shoot */ - if (dnentry == new_dnentry) - destroy = 1; - goto out; - } + if (error) + goto out_free; - error = attach_dn(dn, dnentry, id, fd, filp, mask); - /* !error means that we attached the dn to the dnentry, so don't free it */ - if (!error) - dn = NULL; - /* -EEXIST means that we didn't add this new dn and used an old one. - * that isn't an error (and the unused dn should be freed) */ - else if (error == -EEXIST) - error = 0; + dn->dn_mask = arg; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; + dn->dn_next = inode->i_dnotify; + inode->i_dnotify = dn; + spin_unlock(&inode->i_lock); + return 0; - dnotify_recalc_inode_mask(entry); -out: - spin_unlock(&entry->lock); +out_free: + spin_unlock(&inode->i_lock); + kmem_cache_free(dn_cache, dn); + return error; +} - if (destroy) - fsnotify_destroy_mark_by_entry(entry); +void __inode_dir_notify(struct inode *inode, unsigned long event) +{ + struct dnotify_struct * dn; + struct dnotify_struct **prev; + struct fown_struct * fown; + int changed = 0; - fsnotify_recalc_group_mask(dnotify_group); + spin_lock(&inode->i_lock); + prev = &inode->i_dnotify; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & event) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + changed = 1; + kmem_cache_free(dn_cache, dn); + } + } + if (changed) + redo_inode_mask(inode); + spin_unlock(&inode->i_lock); +} - mutex_unlock(&dnotify_mark_mutex); - fsnotify_put_mark(entry); -out_err: - if (new_entry) - fsnotify_put_mark(new_entry); - if (dn) - kmem_cache_free(dnotify_struct_cache, dn); - return error; +EXPORT_SYMBOL(__inode_dir_notify); + +/* + * This is hopelessly wrong, but unfixable without API changes. At + * least it doesn't oops the kernel... + * + * To safely access ->d_parent we need to keep d_move away from it. Use the + * dentry's d_lock for this. 
+ */ +void dnotify_parent(struct dentry *dentry, unsigned long event) +{ + struct dentry *parent; + + if (!dir_notify_enable) + return; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + if (parent->d_inode->i_dnotify_mask & event) { + dget(parent); + spin_unlock(&dentry->d_lock); + __inode_dir_notify(parent->d_inode, event); + dput(parent); + } else { + spin_unlock(&dentry->d_lock); + } } +EXPORT_SYMBOL_GPL(dnotify_parent); static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); - - dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, - 0, &dnotify_fsnotify_ops); - if (IS_ERR(dnotify_group)) - panic("unable to allocate fsnotify group for dnotify\n"); + dn_cache = kmem_cache_create("dnotify_cache", + sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); return 0; } diff --git a/trunk/fs/notify/fsnotify.c b/trunk/fs/notify/fsnotify.c deleted file mode 100644 index ec2f7bd76818..000000000000 --- a/trunk/fs/notify/fsnotify.c +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -#include -#include "fsnotify.h" - -/* - * Clear all of the marks on an inode when it is being evicted from core - */ -void __fsnotify_inode_delete(struct inode *inode) -{ - fsnotify_clear_marks_by_inode(inode); -} -EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); - -/* - * Given an inode, first check if we care what happens to our children. Inotify - * and dnotify both tell their parents about events. If we care about any event - * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if - * if there is a need to find a parent and send the event to the parent. - */ -void __fsnotify_update_child_dentry_flags(struct inode *inode) -{ - struct dentry *alias; - int watched; - - if (!S_ISDIR(inode->i_mode)) - return; - - /* determine if the children should tell inode about their events */ - watched = fsnotify_inode_watches_children(inode); - - spin_lock(&dcache_lock); - /* run all of the dentries associated with this inode. 
Since this is a - * directory, there damn well better only be one item on this list */ - list_for_each_entry(alias, &inode->i_dentry, d_alias) { - struct dentry *child; - - /* run all of the children of the original inode and fix their - * d_flags to indicate parental interest (their parent is the - * original inode) */ - list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { - if (!child->d_inode) - continue; - - spin_lock(&child->d_lock); - if (watched) - child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; - else - child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; - spin_unlock(&child->d_lock); - } - } - spin_unlock(&dcache_lock); -} - -/* Notify this dentry's parent about a child's events. */ -void __fsnotify_parent(struct dentry *dentry, __u32 mask) -{ - struct dentry *parent; - struct inode *p_inode; - bool send = false; - bool should_update_children = false; - - if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) - return; - - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; - p_inode = parent->d_inode; - - if (fsnotify_inode_watches_children(p_inode)) { - if (p_inode->i_fsnotify_mask & mask) { - dget(parent); - send = true; - } - } else { - /* - * The parent doesn't care about events on it's children but - * at least one child thought it did. We need to run all the - * children and update their d_flags to let them know p_inode - * doesn't care about them any more. - */ - dget(parent); - should_update_children = true; - } - - spin_unlock(&dentry->d_lock); - - if (send) { - /* we are notifying a parent so come up with the new mask which - * specifies these are events which came from a child. */ - mask |= FS_EVENT_ON_CHILD; - - fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); - dput(parent); - } - - if (unlikely(should_update_children)) { - __fsnotify_update_child_dentry_flags(p_inode); - dput(parent); - } -} -EXPORT_SYMBOL_GPL(__fsnotify_parent); - -/* - * This is the main call to fsnotify. The VFS calls into hook specific functions - * in linux/fsnotify.h. Those functions then in turn call here. Here will call - * out to all of the registered fsnotify_group. Those groups can then use the - * notification event in whatever means they feel necessary. - */ -void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) -{ - struct fsnotify_group *group; - struct fsnotify_event *event = NULL; - int idx; - /* global tests shouldn't care about events on child only the specific event */ - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); - - if (list_empty(&fsnotify_groups)) - return; - - if (!(test_mask & fsnotify_mask)) - return; - - if (!(test_mask & to_tell->i_fsnotify_mask)) - return; - /* - * SRCU!! the groups list is very very much read only and the path is - * very hot. The VAST majority of events are not going to need to do - * anything other than walk the list so it's crazy to pre-allocate. 
- */ - idx = srcu_read_lock(&fsnotify_grp_srcu); - list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { - if (test_mask & group->mask) { - if (!group->ops->should_send_event(group, to_tell, mask)) - continue; - if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); - /* shit, we OOM'd and now we can't tell, maybe - * someday someone else will want to do something - * here */ - if (!event) - break; - } - group->ops->handle_event(group, event); - } - } - srcu_read_unlock(&fsnotify_grp_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. - */ - if (event) - fsnotify_put_event(event); -} -EXPORT_SYMBOL_GPL(fsnotify); - -static __init int fsnotify_init(void) -{ - return init_srcu_struct(&fsnotify_grp_srcu); -} -subsys_initcall(fsnotify_init); diff --git a/trunk/fs/notify/fsnotify.h b/trunk/fs/notify/fsnotify.h deleted file mode 100644 index 4dc240824b2d..000000000000 --- a/trunk/fs/notify/fsnotify.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __FS_NOTIFY_FSNOTIFY_H_ -#define __FS_NOTIFY_FSNOTIFY_H_ - -#include -#include -#include -#include - -/* protects reads of fsnotify_groups */ -extern struct srcu_struct fsnotify_grp_srcu; -/* all groups which receive fsnotify events */ -extern struct list_head fsnotify_groups; -/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ -extern __u32 fsnotify_mask; - -/* destroy all events sitting in this groups notification queue */ -extern void fsnotify_flush_notify(struct fsnotify_group *group); - -/* final kfree of a group */ -extern void fsnotify_final_destroy_group(struct fsnotify_group *group); - -/* run the list of all marks associated with inode and flag them to be freed */ -extern void fsnotify_clear_marks_by_inode(struct inode *inode); -/* - * update the dentry->d_flags of all of inode's children to indicate if inode cares - * about events that happen to its children. - */ -extern void __fsnotify_update_child_dentry_flags(struct inode *inode); - -/* allocate and destroy and event holder to attach events to notification/access queues */ -extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); -extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); - -#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/trunk/fs/notify/group.c b/trunk/fs/notify/group.c deleted file mode 100644 index 0e1677144bc5..000000000000 --- a/trunk/fs/notify/group.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include "fsnotify.h" - -#include - -/* protects writes to fsnotify_groups and fsnotify_mask */ -static DEFINE_MUTEX(fsnotify_grp_mutex); -/* protects reads while running the fsnotify_groups list */ -struct srcu_struct fsnotify_grp_srcu; -/* all groups registered to receive filesystem notifications */ -LIST_HEAD(fsnotify_groups); -/* bitwise OR of all events (FS_*) interesting to some group on this system */ -__u32 fsnotify_mask; - -/* - * When a new group registers or changes it's set of interesting events - * this function updates the fsnotify_mask to contain all interesting events - */ -void fsnotify_recalc_global_mask(void) -{ - struct fsnotify_group *group; - __u32 mask = 0; - int idx; - - idx = srcu_read_lock(&fsnotify_grp_srcu); - list_for_each_entry_rcu(group, &fsnotify_groups, group_list) - mask |= group->mask; - srcu_read_unlock(&fsnotify_grp_srcu, idx); - fsnotify_mask = mask; -} - -/* - * Update the group->mask by running all of the marks associated with this - * group and finding the bitwise | of all of the mark->mask. If we change - * the group->mask we need to update the global mask of events interesting - * to the system. - */ -void fsnotify_recalc_group_mask(struct fsnotify_group *group) -{ - __u32 mask = 0; - __u32 old_mask = group->mask; - struct fsnotify_mark_entry *entry; - - spin_lock(&group->mark_lock); - list_for_each_entry(entry, &group->mark_entries, g_list) - mask |= entry->mask; - spin_unlock(&group->mark_lock); - - group->mask = mask; - - if (old_mask != mask) - fsnotify_recalc_global_mask(); -} - -/* - * Take a reference to a group so things found under the fsnotify_grp_mutex - * can't get freed under us - */ -static void fsnotify_get_group(struct fsnotify_group *group) -{ - atomic_inc(&group->refcnt); -} - -/* - * Final freeing of a group - */ -void fsnotify_final_destroy_group(struct fsnotify_group *group) -{ - /* clear the notification queue of all events */ - fsnotify_flush_notify(group); - - if (group->ops->free_group_priv) - group->ops->free_group_priv(group); - - kfree(group); -} - -/* - * Trying to get rid of a group. We need to first get rid of any outstanding - * allocations and then free the group. Remember that fsnotify_clear_marks_by_group - * could miss marks that are being freed by inode and those marks could still - * hold a reference to this group (via group->num_marks) If we get into that - * situtation, the fsnotify_final_destroy_group will get called when that final - * mark is freed. - */ -static void fsnotify_destroy_group(struct fsnotify_group *group) -{ - /* clear all inode mark entries for this group */ - fsnotify_clear_marks_by_group(group); - - /* past the point of no return, matches the initial value of 1 */ - if (atomic_dec_and_test(&group->num_marks)) - fsnotify_final_destroy_group(group); -} - -/* - * Remove this group from the global list of groups that will get events - * this can be done even if there are still references and things still using - * this group. This just stops the group from getting new events. - */ -static void __fsnotify_evict_group(struct fsnotify_group *group) -{ - BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); - - if (group->on_group_list) - list_del_rcu(&group->group_list); - group->on_group_list = 0; -} - -/* - * Called when a group is no longer interested in getting events. This can be - * used if a group is misbehaving or if for some reason a group should no longer - * get any filesystem events. 
- */ -void fsnotify_evict_group(struct fsnotify_group *group) -{ - mutex_lock(&fsnotify_grp_mutex); - __fsnotify_evict_group(group); - mutex_unlock(&fsnotify_grp_mutex); -} - -/* - * Drop a reference to a group. Free it if it's through. - */ -void fsnotify_put_group(struct fsnotify_group *group) -{ - if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) - return; - - /* - * OK, now we know that there's no other users *and* we hold mutex, - * so no new references will appear - */ - __fsnotify_evict_group(group); - - /* - * now it's off the list, so the only thing we might care about is - * srcu access.... - */ - mutex_unlock(&fsnotify_grp_mutex); - synchronize_srcu(&fsnotify_grp_srcu); - - /* and now it is really dead. _Nothing_ could be seeing it */ - fsnotify_recalc_global_mask(); - fsnotify_destroy_group(group); -} - -/* - * Simply run the fsnotify_groups list and find a group which matches - * the given parameters. If a group is found we take a reference to that - * group. - */ -static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask, - const struct fsnotify_ops *ops) -{ - struct fsnotify_group *group_iter; - struct fsnotify_group *group = NULL; - - BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); - - list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) { - if (group_iter->group_num == group_num) { - if ((group_iter->mask == mask) && - (group_iter->ops == ops)) { - fsnotify_get_group(group_iter); - group = group_iter; - } else - group = ERR_PTR(-EEXIST); - } - } - return group; -} - -/* - * Either finds an existing group which matches the group_num, mask, and ops or - * creates a new group and adds it to the global group list. In either case we - * take a reference for the group returned. - */ -struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, - const struct fsnotify_ops *ops) -{ - struct fsnotify_group *group, *tgroup; - - /* very low use, simpler locking if we just always alloc */ - group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL); - if (!group) - return ERR_PTR(-ENOMEM); - - atomic_set(&group->refcnt, 1); - - group->on_group_list = 0; - group->group_num = group_num; - group->mask = mask; - - mutex_init(&group->notification_mutex); - INIT_LIST_HEAD(&group->notification_list); - init_waitqueue_head(&group->notification_waitq); - group->q_len = 0; - group->max_events = UINT_MAX; - - spin_lock_init(&group->mark_lock); - atomic_set(&group->num_marks, 0); - INIT_LIST_HEAD(&group->mark_entries); - - group->ops = ops; - - mutex_lock(&fsnotify_grp_mutex); - tgroup = fsnotify_find_group(group_num, mask, ops); - if (tgroup) { - /* group already exists */ - mutex_unlock(&fsnotify_grp_mutex); - /* destroy the new one we made */ - fsnotify_put_group(group); - return tgroup; - } - - /* group not found, add a new one */ - list_add_rcu(&group->group_list, &fsnotify_groups); - group->on_group_list = 1; - /* being on the fsnotify_groups list holds one num_marks */ - atomic_inc(&group->num_marks); - - mutex_unlock(&fsnotify_grp_mutex); - - if (mask) - fsnotify_recalc_global_mask(); - - return group; -} diff --git a/trunk/fs/notify/inode_mark.c b/trunk/fs/notify/inode_mark.c deleted file mode 100644 index c8a07c65482b..000000000000 --- a/trunk/fs/notify/inode_mark.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free 
Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * fsnotify inode mark locking/lifetime/and refcnting - * - * REFCNT: - * The mark->refcnt tells how many "things" in the kernel currently are - * referencing this object. The object typically will live inside the kernel - * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task - * which can find this object holding the appropriete locks, can take a reference - * and the object itself is guarenteed to survive until the reference is dropped. - * - * LOCKING: - * There are 3 spinlocks involved with fsnotify inode marks and they MUST - * be taken in order as follows: - * - * entry->lock - * group->mark_lock - * inode->i_lock - * - * entry->lock protects 2 things, entry->group and entry->inode. You must hold - * that lock to dereference either of these things (they could be NULL even with - * the lock) - * - * group->mark_lock protects the mark_entries list anchored inside a given group - * and each entry is hooked via the g_list. It also sorta protects the - * free_g_list, which when used is anchored by a private list on the stack of the - * task which held the group->mark_lock. - * - * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a - * given inode and each entry is hooked via the i_list. (and sorta the - * free_i_list) - * - * - * LIFETIME: - * Inode marks survive between when they are added to an inode and when their - * refcnt==0. - * - * The inode mark can be cleared for a number of different reasons including: - * - The inode is unlinked for the last time. (fsnotify_inode_remove) - * - The inode is being evicted from cache. (fsnotify_inode_delete) - * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) - * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry) - * - The fsnotify_group associated with the mark is going away and all such marks - * need to be cleaned up. (fsnotify_clear_marks_by_group) - * - * Worst case we are given an inode and need to clean up all the marks on that - * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each - * mark on the list we take a reference (so the mark can't disappear under us). - * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; At this point we no - * longer fear anything finding the mark using the inode's list of marks. - * - * We can safely and locklessly run the private list on the stack of everything - * we just unattached from the original inode. For each mark on the private list - * we grab the mark-> and can thus dereference mark->group and mark->inode. If - * we see the group and inode are not NULL we take those locks. Now holding all - * 3 locks we can completely remove the mark from other tasks finding it in the - * future. Remember, 10 things might already be referencing this mark, but they - * better be holding a ref. 
We drop our reference we took before we unhooked it - * from the inode. When the ref hits 0 we can free the mark. - * - * Very similarly for freeing by group, except we use free_g_list. - * - * This has the very interesting property of being able to run concurrently with - * any (or all) other directions. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/writeback.h> /* for inode_lock */ - -#include <asm/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -void fsnotify_get_mark(struct fsnotify_mark_entry *entry) -{ - atomic_inc(&entry->refcnt); -} - -void fsnotify_put_mark(struct fsnotify_mark_entry *entry) -{ - if (atomic_dec_and_test(&entry->refcnt)) - entry->free_mark(entry); -} - -/* - * Recalculate the mask of events relevant to a given inode locked. - */ -static void fsnotify_recalc_inode_mask_locked(struct inode *inode) -{ - struct fsnotify_mark_entry *entry; - struct hlist_node *pos; - __u32 new_mask = 0; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) - new_mask |= entry->mask; - inode->i_fsnotify_mask = new_mask; -} - -/* - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this inode. - */ -void fsnotify_recalc_inode_mask(struct inode *inode) -{ - spin_lock(&inode->i_lock); - fsnotify_recalc_inode_mask_locked(inode); - spin_unlock(&inode->i_lock); - - __fsnotify_update_child_dentry_flags(inode); -} - -/* - * Any time a mark is getting freed we end up here. - * The caller had better be holding a reference to this mark so we don't actually - * do the final put under the entry->lock - */ -void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) -{ - struct fsnotify_group *group; - struct inode *inode; - - spin_lock(&entry->lock); - - group = entry->group; - inode = entry->inode; - - BUG_ON(group && !inode); - BUG_ON(!group && inode); - - /* if !group something else already marked this to die */ - if (!group) { - spin_unlock(&entry->lock); - return; - } - - /* 1 from caller and 1 for being on i_list/g_list */ - BUG_ON(atomic_read(&entry->refcnt) < 2); - - spin_lock(&group->mark_lock); - spin_lock(&inode->i_lock); - - hlist_del_init(&entry->i_list); - entry->inode = NULL; - - list_del_init(&entry->g_list); - entry->group = NULL; - - fsnotify_put_mark(entry); /* for i_list and g_list */ - - /* - * this mark is now off the inode->i_fsnotify_mark_entries list and we - * hold the inode->i_lock, so this is the perfect time to update the - * inode->i_fsnotify_mask - */ - fsnotify_recalc_inode_mask_locked(inode); - - spin_unlock(&inode->i_lock); - spin_unlock(&group->mark_lock); - spin_unlock(&entry->lock); - - /* - * Some groups like to know that marks are being freed. This is a - * callback to the group function to let it know that this entry - * is being freed. - */ - if (group->ops->freeing_mark) - group->ops->freeing_mark(entry, group); - - /* - * __fsnotify_update_child_dentry_flags(inode); - * - * I really want to call that, but we can't, we have no idea if the inode - * still exists the second we drop the entry->lock. - * - * The next time an event arrives at this inode from one of its children - * __fsnotify_parent will see that the inode doesn't care about its - * children and will update all of these flags then. So really this - * is just a lazy update (and could be a perf win...)
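Aside (hypothetical helper, not in the patch): the point of the aggregate inode->i_fsnotify_mask recalculated above is that the hot event path can test a single word instead of walking the mark list for every event, roughly:

static int example_inode_cares(struct inode *inode, __u32 mask)
{
	/* one compare against the union of every mark's mask */
	return (inode->i_fsnotify_mask & mask) != 0;
}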
- */ - - - iput(inode); - - /* - * it's possible that this group tried to destroy itself, but this - * this mark was simultaneously being freed by inode. If that's the - * case, we finish freeing the group here. - */ - if (unlikely(atomic_dec_and_test(&group->num_marks))) - fsnotify_final_destroy_group(group); -} - -/* - * Given a group, destroy all of the marks associated with that group. - */ -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) -{ - struct fsnotify_mark_entry *lentry, *entry; - LIST_HEAD(free_list); - - spin_lock(&group->mark_lock); - list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) { - list_add(&entry->free_g_list, &free_list); - list_del_init(&entry->g_list); - fsnotify_get_mark(entry); - } - spin_unlock(&group->mark_lock); - - list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); - } -} - -/* - * Given an inode, destroy all of the marks associated with that inode. - */ -void fsnotify_clear_marks_by_inode(struct inode *inode) -{ - struct fsnotify_mark_entry *entry, *lentry; - struct hlist_node *pos, *n; - LIST_HEAD(free_list); - - spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { - list_add(&entry->free_i_list, &free_list); - hlist_del_init(&entry->i_list); - fsnotify_get_mark(entry); - } - spin_unlock(&inode->i_lock); - - list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); - } -} - -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark_entry *entry; - struct hlist_node *pos; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { - if (entry->group == group) { - fsnotify_get_mark(entry); - return entry; - } - } - return NULL; -} - -/* - * Nothing fancy, just initialize lists and locks and counters. - */ -void fsnotify_init_mark(struct fsnotify_mark_entry *entry, - void (*free_mark)(struct fsnotify_mark_entry *entry)) - -{ - spin_lock_init(&entry->lock); - atomic_set(&entry->refcnt, 1); - INIT_HLIST_NODE(&entry->i_list); - entry->group = NULL; - entry->mask = 0; - entry->inode = NULL; - entry->free_mark = free_mark; -} - -/* - * Attach an initialized mark entry to a given group and inode. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which group and for which inodes. - */ -int fsnotify_add_mark(struct fsnotify_mark_entry *entry, - struct fsnotify_group *group, struct inode *inode) -{ - struct fsnotify_mark_entry *lentry; - int ret = 0; - - inode = igrab(inode); - if (unlikely(!inode)) - return -EINVAL; - - /* - * LOCKING ORDER!!!! 
- * entry->lock - * group->mark_lock - * inode->i_lock - */ - spin_lock(&entry->lock); - spin_lock(&group->mark_lock); - spin_lock(&inode->i_lock); - - entry->group = group; - entry->inode = inode; - - lentry = fsnotify_find_mark_entry(group, inode); - if (!lentry) { - hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); - list_add(&entry->g_list, &group->mark_entries); - - fsnotify_get_mark(entry); /* for i_list and g_list */ - - atomic_inc(&group->num_marks); - - fsnotify_recalc_inode_mask_locked(inode); - } - - spin_unlock(&inode->i_lock); - spin_unlock(&group->mark_lock); - spin_unlock(&entry->lock); - - if (lentry) { - ret = -EEXIST; - iput(inode); - fsnotify_put_mark(lentry); - } else { - __fsnotify_update_child_dentry_flags(inode); - } - - return ret; -} - -/** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) - * - * Called with inode_lock held, protecting the unmounting super block's list - * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. - * We temporarily drop inode_lock, however, and CAN block. - */ -void fsnotify_unmount_inodes(struct list_head *list) -{ - struct inode *inode, *next_i, *need_iput = NULL; - - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { - struct inode *need_iput_tmp; - - /* - * We cannot __iget() an inode in state I_CLEAR, I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) - continue; - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) - continue; - - need_iput_tmp = need_iput; - need_iput = NULL; - - /* In case fsnotify_inode_delete() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; - - /* In case the dropping of a reference would nuke next_i. */ - if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; - } - - /* - * We can safely drop inode_lock here because we hold - * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. Finally, - * iprune_mutex keeps shrink_icache_memory() away. - */ - spin_unlock(&inode_lock); - - if (need_iput_tmp) - iput(need_iput_tmp); - - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - - fsnotify_inode_delete(inode); - - iput(inode); - - spin_lock(&inode_lock); - } -} diff --git a/trunk/fs/notify/inotify/Kconfig b/trunk/fs/notify/inotify/Kconfig index 5356884289a1..446792841023 100644 --- a/trunk/fs/notify/inotify/Kconfig +++ b/trunk/fs/notify/inotify/Kconfig @@ -1,30 +1,26 @@ config INOTIFY bool "Inotify file change notification support" - default n + default y ---help--- - Say Y here to enable legacy in kernel inotify support. Inotify is a - file change notification system. It is a replacement for dnotify. - This option only provides the legacy inotify in kernel API. There - are no in tree kernel users of this interface since it is deprecated. 
- You only need this if you are loading an out of tree kernel module - that uses inotify. + Say Y here to enable inotify support. Inotify is a file change + notification system and a replacement for dnotify. Inotify fixes + numerous shortcomings in dnotify and introduces several new features + including multiple file events, one-shot support, and unmount + notification. For more information, see <file:Documentation/filesystems/inotify.txt> - If unsure, say N. + If unsure, say Y. config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + depends on INOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the associated system calls. Inotify allows monitoring of both files and directories via a single open fd. Events are read from the file descriptor, which is also select()- and poll()-able. - Inotify fixes numerous shortcomings in dnotify and introduces several - new features including multiple file events, one-shot support, and - unmount notification. For more information, see <file:Documentation/filesystems/inotify.txt> diff --git a/trunk/fs/notify/inotify/Makefile b/trunk/fs/notify/inotify/Makefile index 943828171362..e290f3bb9d8d 100644 --- a/trunk/fs/notify/inotify/Makefile +++ b/trunk/fs/notify/inotify/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INOTIFY) += inotify.o -obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o +obj-$(CONFIG_INOTIFY_USER) += inotify_user.o diff --git a/trunk/fs/notify/inotify/inotify.c b/trunk/fs/notify/inotify/inotify.c index 40b1cf914ccb..220c13f0d73d 100644 --- a/trunk/fs/notify/inotify/inotify.c +++ b/trunk/fs/notify/inotify/inotify.c @@ -32,7 +32,6 @@ #include <linux/list.h> #include <linux/writeback.h> #include <linux/inotify.h> -#include <linux/fsnotify_backend.h> static atomic_t inotify_cookie; @@ -906,25 +905,6 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch); */ static int __init inotify_setup(void) { - BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); - BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(IN_OPEN != FS_OPEN); - BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); - BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); - BUILD_BUG_ON(IN_CREATE != FS_CREATE); - BUILD_BUG_ON(IN_DELETE != FS_DELETE); - BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); - BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); - BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); - - BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); - BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); - BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); - BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - atomic_set(&inotify_cookie, 0); return 0; diff --git a/trunk/fs/notify/inotify/inotify.h b/trunk/fs/notify/inotify/inotify.h deleted file mode 100644 index ea2605a58b8a..000000000000 --- a/trunk/fs/notify/inotify/inotify.h +++ /dev/null @@ -1,21 +0,0 @@ -#include <linux/fsnotify_backend.h> -#include <linux/inotify.h> -#include <linux/slab.h> /* struct kmem_cache */ - -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; - int wd; -}; - -struct inotify_inode_mark_entry { - /* fsnotify_mark_entry MUST be the first thing */ - struct fsnotify_mark_entry fsn_entry; - int wd; -}; - -extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); - -extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/trunk/fs/notify/inotify/inotify_fsnotify.c b/trunk/fs/notify/inotify/inotify_fsnotify.c deleted file mode 100644 index 7ef75b83247e..000000000000 ---
a/trunk/fs/notify/inotify/inotify_fsnotify.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * fs/inotify_user.c - inotify support for userspace - * - * Authors: - * John McCutchan - * Robert Love - * - * Copyright (C) 2005 John McCutchan - * Copyright 2006 Hewlett-Packard Development Company, L.P. - * - * Copyright (C) 2009 Eric Paris - * inotify was largely rewritten to make use of the fsnotify infrastructure - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2, or (at your option) any - * later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#include <linux/fs.h> /* struct inode */ -#include <linux/fsnotify_backend.h> -#include <linux/inotify.h> -#include <linux/path.h> /* struct path */ -#include <linux/slab.h> /* kmem_* */ -#include <linux/types.h> - -#include "inotify.h" - -static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_mark_entry *entry; - struct inotify_inode_mark_entry *ientry; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int wd, ret; - - to_tell = event->to_tell; - - spin_lock(&to_tell->i_lock); - entry = fsnotify_find_mark_entry(group, to_tell); - spin_unlock(&to_tell->i_lock); - /* race with watch removal? We already passed should_send */ - if (unlikely(!entry)) - return 0; - ientry = container_of(entry, struct inotify_inode_mark_entry, - fsn_entry); - wd = ientry->wd; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) - return -ENOMEM; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsn_event_priv->group = group; - event_priv->wd = wd; - - ret = fsnotify_add_notify_event(group, event, fsn_event_priv); - /* EEXIST is not an error */ - if (ret == -EEXIST) - ret = 0; - - /* did event_priv get attached?
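Aside (self-contained sketch, not from the patch): inotify_handle_event above recovers its private wrapper from the embedded fsnotify_mark_entry with container_of. The pattern in miniature, with invented names:

#include <stddef.h>

struct base { int refcnt; };

struct wrapper {
	struct base b;	/* kept first so a base * may also be cast directly */
	int wd;
};

#define example_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct wrapper *to_wrapper(struct base *bp)
{
	return example_container_of(bp, struct wrapper, b);
}

container_of itself works at any member offset; the "MUST be the first thing" rule in inotify.h exists because inotify_free_mark casts the pointer directly instead of going through container_of.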
*/ - if (list_empty(&fsn_event_priv->event_list)) - inotify_free_event_priv(fsn_event_priv); - - /* - * If we hold the entry until after the event is on the queue - * IN_IGNORED won't be able to pass this event in the queue - */ - fsnotify_put_mark(entry); - - return ret; -} - -static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) -{ - inotify_destroy_mark_entry(entry, group); -} - -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) -{ - struct fsnotify_mark_entry *entry; - bool send; - - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(group, inode); - spin_unlock(&inode->i_lock); - if (!entry) - return false; - - mask = (mask & ~FS_EVENT_ON_CHILD); - send = (entry->mask & mask); - - /* find took a reference */ - fsnotify_put_mark(entry); - - return send; -} - -static int idr_callback(int id, void *p, void *data) -{ - BUG(); - return 0; -} - -static void inotify_free_group_priv(struct fsnotify_group *group) -{ - /* ideally the idr is empty and we won't hit the BUG in teh callback */ - idr_for_each(&group->inotify_data.idr, idr_callback, NULL); - idr_remove_all(&group->inotify_data.idr); - idr_destroy(&group->inotify_data.idr); -} - -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) -{ - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - kmem_cache_free(event_priv_cachep, event_priv); -} - -const struct fsnotify_ops inotify_fsnotify_ops = { - .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, - .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, - .freeing_mark = inotify_freeing_mark, -}; diff --git a/trunk/fs/notify/inotify/inotify_user.c b/trunk/fs/notify/inotify/inotify_user.c index 982a412ac5bc..1634319e2404 100644 --- a/trunk/fs/notify/inotify/inotify_user.c +++ b/trunk/fs/notify/inotify/inotify_user.c @@ -8,9 +8,6 @@ * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * - * Copyright (C) 2009 Eric Paris - * inotify was largely rewriten to make use of the fsnotify infrastructure - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -22,48 +19,94 @@ * General Public License for more details. */ +#include +#include +#include +#include #include -#include /* struct inode */ -#include -#include -#include /* module_init */ +#include +#include +#include +#include +#include #include -#include /* roundup() */ -#include /* superblock magic number */ -#include /* mntget */ -#include /* LOOKUP_FOLLOW */ -#include /* struct path */ -#include /* struct user */ -#include /* struct kmem_cache */ #include -#include -#include -#include -#include - -#include "inotify.h" +#include #include -static struct vfsmount *inotify_mnt __read_mostly; +static struct kmem_cache *watch_cachep __read_mostly; +static struct kmem_cache *event_cachep __read_mostly; -/* this just sits here and wastes global memory. 
used to just pad userspace messages with zeros */ -static struct inotify_event nul_inotify_event; +static struct vfsmount *inotify_mnt __read_mostly; /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; +static int inotify_max_user_watches __read_mostly; static int inotify_max_queued_events __read_mostly; -int inotify_max_user_watches __read_mostly; -static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; +/* + * Lock ordering: + * + * inotify_dev->up_mutex (ensures we don't re-add the same watch) + * inode->inotify_mutex (protects inode's watch list) + * inotify_handle->mutex (protects inotify_handle's watch list) + * inotify_dev->ev_mutex (protects device's event queue) + */ /* - * When inotify registers a new group it increments this and uses that - * value as an offset to set the fsnotify group "name" and priority. + * Lifetimes of the main data structures: + * + * inotify_device: Lifetime is managed by reference count, from + * sys_inotify_init() until release. Additional references can bump the count + * via get_inotify_dev() and drop the count via put_inotify_dev(). + * + * inotify_user_watch: Lifetime is from create_watch() to the receipt of an + * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the + * first event, or to inotify_destroy(). */ -static atomic_t inotify_grp_num; + +/* + * struct inotify_device - represents an inotify instance + * + * This structure is protected by the mutex 'mutex'. + */ +struct inotify_device { + wait_queue_head_t wq; /* wait queue for i/o */ + struct mutex ev_mutex; /* protects event queue */ + struct mutex up_mutex; /* synchronizes watch updates */ + struct list_head events; /* list of queued events */ + struct user_struct *user; /* user who opened this dev */ + struct inotify_handle *ih; /* inotify handle */ + struct fasync_struct *fa; /* async notification */ + atomic_t count; /* reference count */ + unsigned int queue_size; /* size of the queue (bytes) */ + unsigned int event_count; /* number of pending events */ + unsigned int max_events; /* maximum number of events */ +}; + +/* + * struct inotify_kernel_event - An inotify event, originating from a watch and + * queued for user-space. A list of these is attached to each instance of the + * device. In read(), this list is walked and all events that can fit in the + * buffer are returned. + * + * Protected by dev->ev_mutex of the device in which we are queued. + */ +struct inotify_kernel_event { + struct inotify_event event; /* the user-space event */ + struct list_head list; /* entry in inotify_device's list */ + char *name; /* filename, if any */ +}; + +/* + * struct inotify_user_watch - our version of an inotify_watch, we add + * a reference to the associated inotify_device. 
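For orientation (userspace view, not part of the patch): the queue of inotify_kernel_events described above is what a consumer drains with read(); each record is a struct inotify_event followed by a zero-padded name of event.len bytes. A minimal, runnable consumer:

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) < 0)
		return 1;

	n = read(fd, buf, sizeof(buf));		/* blocks until events queue up */
	for (char *p = buf; p < buf + n; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("wd=%d mask=0x%x name=%s\n", ev->wd, ev->mask,
		       ev->len ? ev->name : "");
		p += sizeof(*ev) + ev->len;	/* records are variable length */
	}
	close(fd);
	return 0;
}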
+ */ +struct inotify_user_watch { + struct inotify_device *dev; /* associated device */ + struct inotify_watch wdata; /* inotify watch data */ +}; #ifdef CONFIG_SYSCTL @@ -106,36 +149,280 @@ ctl_table inotify_table[] = { }; #endif /* CONFIG_SYSCTL */ -static inline __u32 inotify_arg_to_mask(u32 arg) +static inline void get_inotify_dev(struct inotify_device *dev) +{ + atomic_inc(&dev->count); +} + +static inline void put_inotify_dev(struct inotify_device *dev) +{ + if (atomic_dec_and_test(&dev->count)) { + atomic_dec(&dev->user->inotify_devs); + free_uid(dev->user); + kfree(dev); + } +} + +/* + * free_inotify_user_watch - cleans up the watch and its references + */ +static void free_inotify_user_watch(struct inotify_watch *w) +{ + struct inotify_user_watch *watch; + struct inotify_device *dev; + + watch = container_of(w, struct inotify_user_watch, wdata); + dev = watch->dev; + + atomic_dec(&dev->user->inotify_watches); + put_inotify_dev(dev); + kmem_cache_free(watch_cachep, watch); +} + +/* + * kernel_event - create a new kernel event with the given parameters + * + * This function can sleep. + */ +static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, + const char *name) +{ + struct inotify_kernel_event *kevent; + + kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); + if (unlikely(!kevent)) + return NULL; + + /* we hand this out to user-space, so zero it just in case */ + memset(&kevent->event, 0, sizeof(struct inotify_event)); + + kevent->event.wd = wd; + kevent->event.mask = mask; + kevent->event.cookie = cookie; + + INIT_LIST_HEAD(&kevent->list); + + if (name) { + size_t len, rem, event_size = sizeof(struct inotify_event); + + /* + * We need to pad the filename so as to properly align an + * array of inotify_event structures. Because the structure is + * small and the common case is a small filename, we just round + * up to the next multiple of the structure's sizeof. This is + * simple and safe for all architectures. + */ + len = strlen(name) + 1; + rem = event_size - len; + if (len > event_size) { + rem = event_size - (len % event_size); + if (len % event_size == 0) + rem = 0; + } + + kevent->name = kmalloc(len + rem, GFP_NOFS); + if (unlikely(!kevent->name)) { + kmem_cache_free(event_cachep, kevent); + return NULL; + } + memcpy(kevent->name, name, len); + if (rem) + memset(kevent->name + len, 0, rem); + kevent->event.len = len + rem; + } else { + kevent->event.len = 0; + kevent->name = NULL; + } + + return kevent; +} + +/* + * inotify_dev_get_event - return the next event in the given dev's queue + * + * Caller must hold dev->ev_mutex. + */ +static inline struct inotify_kernel_event * +inotify_dev_get_event(struct inotify_device *dev) +{ + return list_entry(dev->events.next, struct inotify_kernel_event, list); +} + +/* + * inotify_dev_get_last_event - return the last event in the given dev's queue + * + * Caller must hold dev->ev_mutex. + */ +static inline struct inotify_kernel_event * +inotify_dev_get_last_event(struct inotify_device *dev) { - __u32 mask; + if (list_empty(&dev->events)) + return NULL; + return list_entry(dev->events.prev, struct inotify_kernel_event, list); +} - /* everything should accept their own ignored and cares about children */ - mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); +/* + * inotify_dev_queue_event - event handler registered with core inotify, adds + * a new event to the given device + * + * Can sleep (calls kernel_event()). 
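Aside (hypothetical equivalent, not in the patch): the branchy padding math in kernel_event() above reduces to rounding the NUL-terminated name length up to a multiple of sizeof(struct inotify_event), which keeps the records aligned in the read() buffer:

#include <stddef.h>

/* name_len includes the trailing NUL and is at least 1 */
static size_t example_padded_len(size_t name_len, size_t event_size)
{
	size_t rem = name_len % event_size;

	return rem ? name_len + (event_size - rem) : name_len;
}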
+ */ +static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, + u32 cookie, const char *name, + struct inode *ignored) +{ + struct inotify_user_watch *watch; + struct inotify_device *dev; + struct inotify_kernel_event *kevent, *last; - /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); + watch = container_of(w, struct inotify_user_watch, wdata); + dev = watch->dev; - return mask; + mutex_lock(&dev->ev_mutex); + + /* we can safely put the watch as we don't reference it while + * generating the event + */ + if (mask & IN_IGNORED || w->mask & IN_ONESHOT) + put_inotify_watch(w); /* final put */ + + /* coalescing: drop this event if it is a dupe of the previous */ + last = inotify_dev_get_last_event(dev); + if (last && last->event.mask == mask && last->event.wd == wd && + last->event.cookie == cookie) { + const char *lastname = last->name; + + if (!name && !lastname) + goto out; + if (name && lastname && !strcmp(lastname, name)) + goto out; + } + + /* the queue overflowed and we already sent the Q_OVERFLOW event */ + if (unlikely(dev->event_count > dev->max_events)) + goto out; + + /* if the queue overflows, we need to notify user space */ + if (unlikely(dev->event_count == dev->max_events)) + kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); + else + kevent = kernel_event(wd, mask, cookie, name); + + if (unlikely(!kevent)) + goto out; + + /* queue the event and wake up anyone waiting */ + dev->event_count++; + dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; + list_add_tail(&kevent->list, &dev->events); + wake_up_interruptible(&dev->wq); + kill_fasync(&dev->fa, SIGIO, POLL_IN); + +out: + mutex_unlock(&dev->ev_mutex); +} + +/* + * remove_kevent - cleans up the given kevent + * + * Caller must hold dev->ev_mutex. + */ +static void remove_kevent(struct inotify_device *dev, + struct inotify_kernel_event *kevent) +{ + list_del(&kevent->list); + + dev->event_count--; + dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} + +/* + * free_kevent - frees the given kevent. + */ +static void free_kevent(struct inotify_kernel_event *kevent) +{ + kfree(kevent->name); + kmem_cache_free(event_cachep, kevent); +} + +/* + * inotify_dev_event_dequeue - destroy an event on the given device + * + * Caller must hold dev->ev_mutex. + */ +static void inotify_dev_event_dequeue(struct inotify_device *dev) +{ + if (!list_empty(&dev->events)) { + struct inotify_kernel_event *kevent; + kevent = inotify_dev_get_event(dev); + remove_kevent(dev, kevent); + free_kevent(kevent); + } +} + +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int find_inode(const char __user *dirname, struct path *path, + unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; } -static inline u32 inotify_mask_to_arg(__u32 mask) +/* + * create_watch - creates a watch on the given device. + * + * Callers must hold dev->up_mutex. 
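Aside (condensed restatement, not a new function in the patch): the coalescing test in inotify_dev_queue_event() above drops an event only when it exactly duplicates the newest queued event, so a burst of identical events occupies one queue slot. Assuming the struct inotify_kernel_event defined earlier:

static int example_is_dup(const struct inotify_kernel_event *last,
			  u32 wd, u32 mask, u32 cookie, const char *name)
{
	if (!last || last->event.mask != mask || last->event.wd != wd ||
	    last->event.cookie != cookie)
		return 0;
	if (!name && !last->name)
		return 1;
	return name && last->name && !strcmp(name, last->name);
}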
+ */ +static int create_watch(struct inotify_device *dev, struct inode *inode, + u32 mask) { - return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | - IN_Q_OVERFLOW); + struct inotify_user_watch *watch; + int ret; + + if (atomic_read(&dev->user->inotify_watches) >= + inotify_max_user_watches) + return -ENOSPC; + + watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); + if (unlikely(!watch)) + return -ENOMEM; + + /* save a reference to device and bump the count to make it official */ + get_inotify_dev(dev); + watch->dev = dev; + + atomic_inc(&dev->user->inotify_watches); + + inotify_init_watch(&watch->wdata); + ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); + if (ret < 0) + free_inotify_user_watch(&watch->wdata); + + return ret; } -/* intofiy userspace file descriptor functions */ +/* Device Interface */ + static unsigned int inotify_poll(struct file *file, poll_table *wait) { - struct fsnotify_group *group = file->private_data; + struct inotify_device *dev = file->private_data; int ret = 0; - poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); - if (!fsnotify_notify_queue_is_empty(group)) + poll_wait(file, &dev->wq, wait); + mutex_lock(&dev->ev_mutex); + if (!list_empty(&dev->events)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + mutex_unlock(&dev->ev_mutex); return ret; } @@ -145,29 +432,26 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the group->notification_mutex held. + * Called with the device ev_mutex held. */ -static struct fsnotify_event *get_one_event(struct fsnotify_group *group, - size_t count) +static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, + size_t count) { size_t event_size = sizeof(struct inotify_event); - struct fsnotify_event *event; + struct inotify_kernel_event *kevent; - if (fsnotify_notify_queue_is_empty(group)) + if (list_empty(&dev->events)) return NULL; - event = fsnotify_peek_notify_event(group); - - event_size += roundup(event->name_len, event_size); + kevent = inotify_dev_get_event(dev); + if (kevent->name) + event_size += kevent->event.len; if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the - * same event we peeked above */ - fsnotify_remove_notify_event(group); - - return event; + remove_kevent(dev, kevent); + return kevent; } /* @@ -176,90 +460,51 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. 
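For orientation (userspace view, not part of the patch): inotify_poll() above is what makes the descriptor select()/poll()-able, so a consumer can multiplex inotify with other fds:

#include <poll.h>

/* returns >0 when at least one event is queued, 0 on timeout, <0 on error */
static int example_wait_for_events(int inotify_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = inotify_fd, .events = POLLIN };

	return poll(&pfd, 1, timeout_ms);
}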
*/ -static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, +static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, char __user *buf) { - struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); - size_t name_len; - - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } - - /* round up event->name_len so it is a multiple of event_size */ - name_len = roundup(event->name_len, event_size); - inotify_event.len = name_len; - - inotify_event.mask = inotify_mask_to_arg(event->mask); - inotify_event.cookie = event->sync_cookie; - /* send the main event */ - if (copy_to_user(buf, &inotify_event, event_size)) + if (copy_to_user(buf, &kevent->event, event_size)) return -EFAULT; - buf += event_size; + if (kevent->name) { + buf += event_size; - /* - * fsnotify only stores the pathname, so here we have to send the pathname - * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. - */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; - /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, kevent->name, kevent->event.len)) return -EFAULT; - buf += event->name_len; - /* fill userspace with 0's from nul_inotify_event */ - if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) - return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += kevent->event.len; } - return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { - struct fsnotify_group *group; - struct fsnotify_event *kevent; + struct inotify_device *dev; char __user *start; int ret; DEFINE_WAIT(wait); start = buf; - group = file->private_data; + dev = file->private_data; while (1) { - prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + struct inotify_kernel_event *kevent; - mutex_lock(&group->notification_mutex); - kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&dev->ev_mutex); + kevent = get_one_event(dev, count); + mutex_unlock(&dev->ev_mutex); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; - ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + ret = copy_event_to_user(kevent, buf); + free_kevent(kevent); if (ret < 0) break; buf += ret; @@ -280,7 +525,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, schedule(); } - finish_wait(&group->notification_waitq, &wait); + finish_wait(&dev->wq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; @@ -288,19 +533,25 @@ static ssize_t inotify_read(struct file *file, char __user *buf, static int inotify_fasync(int fd, struct file *file, int on) { - struct fsnotify_group *group = file->private_data; + struct inotify_device *dev = file->private_data; - return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 
0 : -EIO; + return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; } static int inotify_release(struct inode *ignored, struct file *file) { - struct fsnotify_group *group = file->private_data; + struct inotify_device *dev = file->private_data; + + inotify_destroy(dev->ih); - fsnotify_clear_marks_by_group(group); + /* destroy all of the events on this device */ + mutex_lock(&dev->ev_mutex); + while (!list_empty(&dev->events)) + inotify_dev_event_dequeue(dev); + mutex_unlock(&dev->ev_mutex); - /* free this group, matching get was inotify_init->fsnotify_obtain_group */ - fsnotify_put_group(group); + /* free this device: the put matching the get in inotify_init() */ + put_inotify_dev(dev); return 0; } @@ -308,27 +559,16 @@ static int inotify_release(struct inode *ignored, struct file *file) static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct inotify_device *dev; void __user *p; int ret = -ENOTTY; - size_t send_len = 0; - group = file->private_data; + dev = file->private_data; p = (void __user *) arg; switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; - send_len += sizeof(struct inotify_event); - send_len += roundup(event->name_len, - sizeof(struct inotify_event)); - } - mutex_unlock(&group->notification_mutex); - ret = put_user(send_len, (int __user *) p); + ret = put_user(dev->queue_size, (int __user *) p); break; } @@ -336,233 +576,23 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { - .poll = inotify_poll, - .read = inotify_read, - .fasync = inotify_fasync, - .release = inotify_release, - .unlocked_ioctl = inotify_ioctl, + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; +static const struct inotify_operations inotify_user_ops = { + .handle_event = inotify_dev_queue_event, + .destroy_watch = free_inotify_user_watch, +}; -/* - * find_inode - resolve a user-given path to a specific inode - */ -static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) -{ - int error; - - error = user_path_at(AT_FDCWD, dirname, flags, path); - if (error) - return error; - /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); - if (error) - path_put(path); - return error; -} - -/* - * When, for whatever reason, inotify is done with a mark (or what used to be a - * watch) we need to remove that watch from the idr and we need to send IN_IGNORED - * for the given wd. - * - * There is a bit of recursion here. The loop looks like: - * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> - * inotify_freeing_mark -> inotify_destory_mark_entry -> restart - * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets - * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) - * test below will not call back to fsnotify again. But even if that test wasn't - * there this would still be safe since fsnotify_destroy_mark_by_entry() is - * safe from recursion. 
- */ -void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) -{ - struct inotify_inode_mark_entry *ientry; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - struct fsnotify_group *egroup; - struct idr *idr; - - spin_lock(&entry->lock); - egroup = entry->group; - - /* if egroup we aren't really done and something might still send events - * for this inode, on the callback we'll send the IN_IGNORED */ - if (egroup) { - spin_unlock(&entry->lock); - fsnotify_destroy_mark_by_entry(entry); - return; - } - spin_unlock(&entry->lock); - - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsn_event_priv->group = group; - event_priv->wd = ientry->wd; - - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); - - /* did the private data get added? */ - if (list_empty(&fsn_event_priv->event_list)) - inotify_free_event_priv(fsn_event_priv); - -skip_send_ignore: - - /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - - /* removed from idr, drop that reference */ - fsnotify_put_mark(entry); -} - -/* ding dong the mark is dead */ -static void inotify_free_mark(struct fsnotify_mark_entry *entry) -{ - struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; - - kmem_cache_free(inotify_inode_mark_cachep, ientry); -} - -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) -{ - struct fsnotify_mark_entry *entry = NULL; - struct inotify_inode_mark_entry *ientry; - int ret = 0; - int add = (arg & IN_MASK_ADD); - __u32 mask; - __u32 old_mask, new_mask; - - /* don't allow invalid bits: we don't want flags set */ - mask = inotify_arg_to_mask(arg); - if (unlikely(!mask)) - return -EINVAL; - - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) - return -ENOMEM; - /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; - -find_entry: - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(group, inode); - spin_unlock(&inode->i_lock); - if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) - goto out_err; - - entry = &ientry->fsn_entry; -retry: - ret = -ENOMEM; - if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) - goto out_err; - - spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. 
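Aside (hypothetical helper, not in the patch): the retry dance around idr_pre_get()/idr_get_new_above() here is the standard pattern for the old two-step idr API: preload memory without the lock, allocate the id under the lock, and loop on -EAGAIN if the preloaded memory was consumed by someone else:

static int example_idr_alloc(struct idr *idr, spinlock_t *lock,
			     void *ptr, int start, int *id)
{
	int ret;

	do {
		if (!idr_pre_get(idr, GFP_KERNEL))
			return -ENOMEM;
		spin_lock(lock);
		ret = idr_get_new_above(idr, ptr, start, id);
		spin_unlock(lock);
	} while (ret == -EAGAIN);

	return ret;
}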
remember to drop this reference - * when entry is removed from idr */ - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - if (ret) { - if (ret == -EAGAIN) - goto retry; - goto out_err; - } - atomic_inc(&group->inotify_data.user->inotify_watches); - } - - spin_lock(&entry->lock); - - old_mask = entry->mask; - if (add) { - entry->mask |= mask; - new_mask = entry->mask; - } else { - entry->mask = mask; - new_mask = entry->mask; - } - - spin_unlock(&entry->lock); - - if (old_mask != new_mask) { - /* more bits in old than in new? */ - int dropped = (old_mask & ~new_mask); - /* more bits in this entry than the inode's mask? */ - int do_inode = (new_mask & ~inode->i_fsnotify_mask); - /* more bits in this entry than the group? */ - int do_group = (new_mask & ~group->mask); - - /* update the inode with this new entry */ - if (dropped || do_inode) - fsnotify_recalc_inode_mask(inode); - - /* update the group mask with the new mask */ - if (dropped || do_group) - fsnotify_recalc_group_mask(group); - } - - return ientry->wd; - -out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); - } - return ret; -} - -static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) -{ - struct fsnotify_group *group; - unsigned int grp_num; - - /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ - grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num)); - group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops); - if (IS_ERR(group)) - return group; - - group->max_events = max_events; - - spin_lock_init(&group->inotify_data.idr_lock); - idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; - group->inotify_data.user = user; - group->inotify_data.fa = NULL; - - return group; -} - - -/* inotify syscalls */ SYSCALL_DEFINE1(inotify_init1, int, flags) { - struct fsnotify_group *group; + struct inotify_device *dev; + struct inotify_handle *ih; struct user_struct *user; struct file *filp; int fd, ret; @@ -591,27 +621,45 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) goto out_free_uid; } - /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ - group = inotify_new_group(user, inotify_max_queued_events); - if (IS_ERR(group)) { - ret = PTR_ERR(group); + dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); + if (unlikely(!dev)) { + ret = -ENOMEM; goto out_free_uid; } + ih = inotify_init(&inotify_user_ops); + if (IS_ERR(ih)) { + ret = PTR_ERR(ih); + goto out_free_dev; + } + dev->ih = ih; + dev->fa = NULL; + filp->f_op = &inotify_fops; filp->f_path.mnt = mntget(inotify_mnt); filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); - filp->private_data = group; - + filp->private_data = dev; + + INIT_LIST_HEAD(&dev->events); + init_waitqueue_head(&dev->wq); + mutex_init(&dev->ev_mutex); + mutex_init(&dev->up_mutex); + dev->event_count = 0; + dev->queue_size = 0; + dev->max_events = inotify_max_queued_events; + dev->user = user; + atomic_set(&dev->count, 0); + + get_inotify_dev(dev); atomic_inc(&user->inotify_devs); - fd_install(fd, filp); return fd; - +out_free_dev: + kfree(dev); out_free_uid: free_uid(user); put_filp(filp); @@ -628,8 
+676,8 @@ SYSCALL_DEFINE0(inotify_init) SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { - struct fsnotify_group *group; struct inode *inode; + struct inotify_device *dev; struct path path; struct file *filp; int ret, fput_needed; @@ -650,20 +698,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = inotify_find_inode(pathname, &path, flags); - if (ret) + ret = find_inode(pathname, &path, flags); + if (unlikely(ret)) goto fput_and_out; - /* inode held in place by reference to path; group by fget on fd */ + /* inode held in place by reference to path; dev by fget on fd */ inode = path.dentry->d_inode; - group = filp->private_data; + dev = filp->private_data; - /* create/update an inode mark */ - ret = inotify_update_watch(group, inode, mask); - if (unlikely(ret)) - goto path_put_and_out; + mutex_lock(&dev->up_mutex); + ret = inotify_find_update_watch(dev->ih, inode, mask); + if (ret == -ENOENT) + ret = create_watch(dev, inode, mask); + mutex_unlock(&dev->up_mutex); -path_put_and_out: path_put(&path); fput_and_out: fput_light(filp, fput_needed); @@ -672,10 +720,9 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { - struct fsnotify_group *group; - struct fsnotify_mark_entry *entry; struct file *filp; - int ret = 0, fput_needed; + struct inotify_device *dev; + int ret, fput_needed; filp = fget_light(fd, &fput_needed); if (unlikely(!filp)) @@ -687,20 +734,10 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) goto out; } - group = filp->private_data; + dev = filp->private_data; - spin_lock(&group->inotify_data.idr_lock); - entry = idr_find(&group->inotify_data.idr, wd); - if (unlikely(!entry)) { - spin_unlock(&group->inotify_data.idr_lock); - ret = -EINVAL; - goto out; - } - fsnotify_get_mark(entry); - spin_unlock(&group->inotify_data.idr_lock); - - inotify_destroy_mark_entry(entry, group); - fsnotify_put_mark(entry); + /* we free our watch data when we get IN_IGNORED */ + ret = inotify_rm_wd(dev->ih, wd); out: fput_light(filp, fput_needed); @@ -716,9 +753,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags, } static struct file_system_type inotify_fs_type = { - .name = "inotifyfs", - .get_sb = inotify_get_sb, - .kill_sb = kill_anon_super, + .name = "inotifyfs", + .get_sb = inotify_get_sb, + .kill_sb = kill_anon_super, }; /* @@ -738,16 +775,18 @@ static int __init inotify_user_setup(void) if (IS_ERR(inotify_mnt)) panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); - inotify_max_queued_events = 16384; inotify_max_user_instances = 128; inotify_max_user_watches = 8192; + watch_cachep = kmem_cache_create("inotify_watch_cache", + sizeof(struct inotify_user_watch), + 0, SLAB_PANIC, NULL); + event_cachep = kmem_cache_create("inotify_event_cache", + sizeof(struct inotify_kernel_event), + 0, SLAB_PANIC, NULL); + return 0; } + module_init(inotify_user_setup); diff --git a/trunk/fs/notify/notification.c b/trunk/fs/notify/notification.c deleted file mode 100644 index 959b73e756fd..000000000000 --- a/trunk/fs/notify/notification.c +++ 
/dev/null @@ -1,411 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * Basic idea behind the notification queue: An fsnotify group (like inotify) - * sends the userspace notification about events asynchronously some time after - * the event happened. When inotify gets an event it will need to add that - * event to the group notify queue. Since a single event might need to be on - * multiple groups' notification queues we can't add the event directly to each - * queue and instead add a small "event_holder" to each queue. This event_holder - * has a pointer back to the original event. Since the majority of events are - * going to end up on one, and only one, notification queue we embed one - * event_holder into each event. This means we have a single allocation instead - * of always needing two. If the embedded event_holder is already in use by - * another group a new event_holder (from fsnotify_event_holder_cachep) will be - * allocated and used. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/mutex.h> -#include <linux/namei.h> -#include <linux/path.h> -#include <linux/slab.h> -#include <linux/spinlock.h> - -#include <asm/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. Its refcnt is set to 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event q_overflow_event; -static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); - -/** - * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. - * Called from fsnotify_move, which is inlined into filesystem modules. - */ -u32 fsnotify_get_cookie(void) -{ - return atomic_inc_return(&fsnotify_sync_cookie); -} -EXPORT_SYMBOL_GPL(fsnotify_get_cookie); - -/* return true if the notify queue is empty, false otherwise */ -bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) -{ - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - return list_empty(&group->notification_list) ?
true : false; -} - -void fsnotify_get_event(struct fsnotify_event *event) -{ - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) -{ - if (!event) - return; - - if (atomic_dec_and_test(&event->refcnt)) { - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; -} - -/* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. - */ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) -{ - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - case (FSNOTIFY_EVENT_NONE): - return true; - }; - } - return false; -} - -/* - * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. If the event is successfully added to the - * group's notification queue, a reference is taken on event. - */ -int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv) -{ - struct fsnotify_event_holder *holder = NULL; - struct list_head *list = &group->notification_list; - struct fsnotify_event_holder *last_holder; - struct fsnotify_event *last_event; - - /* easy to tell if priv was attached to the event */ - INIT_LIST_HEAD(&priv->event_list); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. - * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... 
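Aside (restating the invariant, not new patch code): an event's embedded holder is free exactly when its list node is unhooked, which is why the code below can test list_empty() before deciding to allocate; the shared q_overflow_event is the one case where the embedded holder is routinely busy:

static int example_holder_in_use(struct fsnotify_event *event)
{
	return !list_empty(&event->holder.event_list);
}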
- */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return -ENOMEM; - } - - mutex_lock(&group->notification_mutex); - - if (group->q_len >= group->max_events) { - event = &q_overflow_event; - /* sorry, no private data on the overflow event */ - priv = NULL; - } - - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - goto alloc_holder; - } - - if (!list_empty(list)) { - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) { - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return -EEXIST; - } - } - - group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - - wake_up(&group->notification_waitq); - return 0; -} - -/* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. - */ -struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) -{ - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - - group->q_len--; - - return event; -} - -/* - * This will not remove the event, that must be done with fsnotify_remove_notify_event() - */ -struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) -{ - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; -} - -/* - * Called when a group is being torn down to clean up any outstanding - * event notifications. 
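Aside (hypothetical caller, not in the patch): the dequeue contract above is that peek and remove happen under a single hold of notification_mutex, and the removed event carries a reference the caller must drop:

static struct fsnotify_event *example_pop(struct fsnotify_group *group)
{
	struct fsnotify_event *event = NULL;

	mutex_lock(&group->notification_mutex);
	if (!fsnotify_notify_queue_is_empty(group))
		event = fsnotify_remove_notify_event(group);
	mutex_unlock(&group->notification_mutex);

	return event;	/* caller owns a reference; fsnotify_put_event() it */
}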
- */ -void fsnotify_flush_notify(struct fsnotify_group *group) -{ - struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; - - mutex_lock(&group->notification_mutex); - while (!fsnotify_notify_queue_is_empty(group)) { - event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ - } - mutex_unlock(&group->notification_mutex); -} - -static void initialize_event(struct fsnotify_event *event) -{ - event->holder.event = NULL; - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - event->path.dentry = NULL; - event->path.mnt = NULL; - event->inode = NULL; - event->data_type = FSNOTIFY_EVENT_NONE; - - INIT_LIST_HEAD(&event->private_data_list); - - event->to_tell = NULL; - - event->file_name = NULL; - event->name_len = 0; - - event->sync_cookie = 0; -} - -/* - * fsnotify_create_event - Allocate a new event which will be sent to each - * group's handle_event function if the group was interested in this - * particular event. - * - * @to_tell the inode which is supposed to receive the event (sometimes a - * parent of the inode to which the event happened). - * @mask what actually happened. - * @data pointer to the object which was actually affected - * @data_type flag indicating whether the data is a file, path, inode, nothing... - * @name the filename, if available - */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->sync_cookie = cookie; - event->to_tell = to_tell; - - switch (data_type) { - case FSNOTIFY_EVENT_FILE: { - struct file *file = data; - struct path *path = &file->f_path; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - event->data_type = FSNOTIFY_EVENT_PATH; - break; - } - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - event->data_type = FSNOTIFY_EVENT_PATH; - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - event->data_type = FSNOTIFY_EVENT_INODE; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - - event->mask = mask; - - return event; -} - -__init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - initialize_event(&q_overflow_event); - q_overflow_event.mask = FS_Q_OVERFLOW; - - return 0; -} -subsys_initcall(fsnotify_notification_init); - diff --git a/trunk/fs/ntfs/super.c b/trunk/fs/ntfs/super.c index abaaa1cbf8de..6aa7c4713536 100644 --- a/trunk/fs/ntfs/super.c +++ b/trunk/fs/ntfs/super.c @@ -443,8 +443,6 @@ static int 
ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering with remount options string: %s", opt); - - lock_kernel(); #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; @@ -468,18 +466,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) if (NVolErrors(vol)) { ntfs_error(sb, "Volume has errors and is read-only%s", es); - unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_IS_DIRTY) { ntfs_error(sb, "Volume is dirty and read-only%s", es); - unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { ntfs_error(sb, "Volume has been modified by chkdsk " "and is read-only%s", es); - unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { @@ -487,13 +482,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) "(0x%x) and is read-only%s", (unsigned)le16_to_cpu(vol->vol_flags), es); - unlock_kernel(); return -EROFS; } if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { ntfs_error(sb, "Failed to set dirty bit in volume " "information flags%s", es); - unlock_kernel(); return -EROFS; } #if 0 @@ -513,21 +506,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_error(sb, "Failed to empty journal $LogFile%s", es); NVolSetErrors(vol); - unlock_kernel(); return -EROFS; } if (!ntfs_mark_quotas_out_of_date(vol)) { ntfs_error(sb, "Failed to mark quotas out of date%s", es); NVolSetErrors(vol); - unlock_kernel(); return -EROFS; } if (!ntfs_stamp_usnjrnl(vol)) { ntfs_error(sb, "Failed to stamp transaction log " "($UsnJrnl)%s", es); NVolSetErrors(vol); - unlock_kernel(); return -EROFS; } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { @@ -543,11 +533,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) // TODO: Deal with *flags. - if (!parse_options(vol, opt)) { - unlock_kernel(); + if (!parse_options(vol, opt)) return -EINVAL; - } - unlock_kernel(); ntfs_debug("Done."); return 0; } @@ -2259,9 +2246,6 @@ static void ntfs_put_super(struct super_block *sb) ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering."); - - lock_kernel(); - #ifdef NTFS_RW /* * Commit all inodes while they are still open in case some of them @@ -2389,12 +2373,39 @@ static void ntfs_put_super(struct super_block *sb) vol->mftmirr_ino = NULL; } /* - * We should have no dirty inodes left, due to - * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as - * the underlying mft records are written out and cleaned. + * If any dirty inodes are left, throw away all mft data page cache + * pages to allow a clean umount. This should never happen any more + * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. If it does + * happen anyway, we want to know... */ ntfs_commit_inode(vol->mft_ino); write_inode_now(vol->mft_ino, 1); + if (sb_has_dirty_inodes(sb)) { + const char *s1, *s2; + + mutex_lock(&vol->mft_ino->i_mutex); + truncate_inode_pages(vol->mft_ino->i_mapping, 0); + mutex_unlock(&vol->mft_ino->i_mutex); + write_inode_now(vol->mft_ino, 1); + if (sb_has_dirty_inodes(sb)) { + static const char *_s1 = "inodes"; + static const char *_s2 = ""; + s1 = _s1; + s2 = _s2; + } else { + static const char *_s1 = "mft pages"; + static const char *_s2 = "They have been thrown " + "away. "; + s1 = _s1; + s2 = _s2; + } + ntfs_error(sb, "Dirty %s found at umount time. %sYou should " + "run chkdsk. 
Please email " + "linux-ntfs-dev@lists.sourceforge.net and say " + "that you saw this message. Thank you.", s1, + s2); + } #endif /* NTFS_RW */ iput(vol->mft_ino); @@ -2433,8 +2444,7 @@ static void ntfs_put_super(struct super_block *sb) } sb->s_fs_info = NULL; kfree(vol); - - unlock_kernel(); + return; } /** diff --git a/trunk/fs/ocfs2/super.c b/trunk/fs/ocfs2/super.c index 201b40a441fe..5c6163f55039 100644 --- a/trunk/fs/ocfs2/super.c +++ b/trunk/fs/ocfs2/super.c @@ -42,7 +42,6 @@ #include #include #include -#include #define MLOG_MASK_PREFIX ML_SUPER #include @@ -127,6 +126,7 @@ static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh, int block, int sect_size); +static void ocfs2_write_super(struct super_block *sb); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_destroy_inode(struct inode *inode); static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); @@ -141,6 +141,7 @@ static const struct super_operations ocfs2_sops = { .clear_inode = ocfs2_clear_inode, .delete_inode = ocfs2_delete_inode, .sync_fs = ocfs2_sync_fs, + .write_super = ocfs2_write_super, .put_super = ocfs2_put_super, .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, @@ -364,12 +365,24 @@ static struct file_operations ocfs2_osb_debug_fops = { .llseek = generic_file_llseek, }; +/* + * write_super and sync_fs ripped right out of ext3. + */ +static void ocfs2_write_super(struct super_block *sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + static int ocfs2_sync_fs(struct super_block *sb, int wait) { int status; tid_t target; struct ocfs2_super *osb = OCFS2_SB(sb); + sb->s_dirt = 0; + if (ocfs2_is_hard_readonly(osb)) return -EROFS; @@ -582,8 +595,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct mount_options parsed_options; struct ocfs2_super *osb = OCFS2_SB(sb); - lock_kernel(); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { ret = -EINVAL; goto out; @@ -687,7 +698,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ocfs2_set_journal_params(osb); } out: - unlock_kernel(); return ret; } @@ -1540,13 +1550,9 @@ static void ocfs2_put_super(struct super_block *sb) { mlog_entry("(0x%p)\n", sb); - lock_kernel(); - ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - unlock_kernel(); - mlog_exit_void(); } diff --git a/trunk/fs/omfs/file.c b/trunk/fs/omfs/file.c index d17e774eaf45..834b2331f6b3 100644 --- a/trunk/fs/omfs/file.c +++ b/trunk/fs/omfs/file.c @@ -11,6 +11,21 @@ #include #include "omfs.h" +static int omfs_sync_file(struct file *file, struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + err |= omfs_sync_inode(inode); + return err ? 
-EIO : 0; +} + static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) { return (sbi->s_sys_blocksize - offset - @@ -329,7 +344,7 @@ struct file_operations omfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = omfs_sync_file, .splice_read = generic_file_splice_read, }; diff --git a/trunk/fs/open.c b/trunk/fs/open.c index 7200e23d9258..bdfbf03615a4 100644 --- a/trunk/fs/open.c +++ b/trunk/fs/open.c @@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) audit_inode(NULL, dentry); - err = mnt_want_write_file(file); + err = mnt_want_write(file->f_path.mnt); if (err) goto out_putf; mutex_lock(&inode->i_mutex); @@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!file) goto out; - error = mnt_want_write_file(file); + error = mnt_want_write(file->f_path.mnt); if (error) goto out_fput; dentry = file->f_path.dentry; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 753ca37002c8..f6db9618a888 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -92,28 +92,3 @@ struct pde_opener { struct list_head lh; }; void pde_users_dec(struct proc_dir_entry *pde); - -extern spinlock_t proc_subdir_lock; - -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -int task_statm(struct mm_struct *, int *, int *, int *, int *); -void task_mem(struct seq_file *, struct mm_struct *); - -struct proc_dir_entry *de_get(struct proc_dir_entry *de); -void de_put(struct proc_dir_entry *de); - -extern struct vfsmount *proc_mnt; -int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); - -/* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/ subdirectories. - */ -int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); diff --git a/trunk/fs/proc/proc_devtree.c b/trunk/fs/proc/proc_devtree.c index fc6c3025befd..de2bba5a3440 100644 --- a/trunk/fs/proc/proc_devtree.c +++ b/trunk/fs/proc/proc_devtree.c @@ -11,7 +11,6 @@ #include #include #include -#include "internal.h" #ifndef HAVE_ARCH_DEVTREE_FIXUPS static inline void set_node_proc_entry(struct device_node *np, diff --git a/trunk/fs/qnx4/Makefile b/trunk/fs/qnx4/Makefile index e4d408cc5473..502d7fe98bab 100644 --- a/trunk/fs/qnx4/Makefile +++ b/trunk/fs/qnx4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4.o -qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o +qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o diff --git a/trunk/fs/qnx4/bitmap.c b/trunk/fs/qnx4/bitmap.c index e1cd061a25f7..8425cf6e9624 100644 --- a/trunk/fs/qnx4/bitmap.c +++ b/trunk/fs/qnx4/bitmap.c @@ -13,9 +13,14 @@ * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 
*/ +#include +#include +#include +#include +#include +#include #include #include -#include "qnx4.h" #if 0 int qnx4_new_block(struct super_block *sb) diff --git a/trunk/fs/qnx4/dir.c b/trunk/fs/qnx4/dir.c index 003c68f3238b..ea9ffefb48ad 100644 --- a/trunk/fs/qnx4/dir.c +++ b/trunk/fs/qnx4/dir.c @@ -11,9 +11,14 @@ * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. */ +#include +#include +#include +#include +#include #include #include -#include "qnx4.h" + static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) { @@ -79,7 +84,7 @@ const struct file_operations qnx4_dir_operations = { .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = simple_fsync, + .fsync = file_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/trunk/fs/qnx4/file.c b/trunk/fs/qnx4/file.c index 09b170ac936c..867f42b02035 100644 --- a/trunk/fs/qnx4/file.c +++ b/trunk/fs/qnx4/file.c @@ -12,7 +12,8 @@ * 27-06-1998 by Frank Denis : file overwriting. */ -#include "qnx4.h" +#include +#include /* * We have mostly NULL's here: the current defaults are ok for @@ -28,7 +29,7 @@ const struct file_operations qnx4_file_operations = #ifdef CONFIG_QNX4FS_RW .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = simple_fsync, + .fsync = qnx4_sync_file, #endif }; diff --git a/trunk/fs/qnx4/fsync.c b/trunk/fs/qnx4/fsync.c new file mode 100644 index 000000000000..aa3b19544bee --- /dev/null +++ b/trunk/fs/qnx4/fsync.c @@ -0,0 +1,169 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 24-03-1998 by Richard Frowijn : first release. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * The functions for qnx4 fs file synchronization. 
+ */ + +#ifdef CONFIG_QNX4FS_RW + +static int sync_block(struct inode *inode, unsigned short *block, int wait) +{ + struct buffer_head *bh; + unsigned short tmp; + + if (!*block) + return 0; + tmp = *block; + bh = sb_find_get_block(inode->i_sb, *block); + if (!bh) + return 0; + if (*block != tmp) { + brelse(bh); + return 1; + } + if (wait && buffer_req(bh) && !buffer_uptodate(bh)) { + brelse(bh); + return -1; + } + if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) { + brelse(bh); + return 0; + } + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + return 0; +} + +#ifdef WTF +static int sync_iblock(struct inode *inode, unsigned short *iblock, + struct buffer_head **bh, int wait) +{ + int rc; + unsigned short tmp; + + *bh = NULL; + tmp = *iblock; + if (!tmp) + return 0; + rc = sync_block(inode, iblock, wait); + if (rc) + return rc; + *bh = sb_bread(inode->i_sb, tmp); + if (tmp != *iblock) { + brelse(*bh); + *bh = NULL; + return 1; + } + if (!*bh) + return -1; + return 0; +} +#endif + +static int sync_direct(struct inode *inode, int wait) +{ + int i; + int rc, err = 0; + + for (i = 0; i < 7; i++) { + rc = sync_block(inode, + (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait); + if (rc > 0) + break; + if (rc) + err = rc; + } + return err; +} + +#ifdef WTF +static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait) +{ + int i; + struct buffer_head *ind_bh; + int rc, err = 0; + + rc = sync_iblock(inode, iblock, &ind_bh, wait); + if (rc || !ind_bh) + return rc; + + for (i = 0; i < 512; i++) { + rc = sync_block(inode, + ((unsigned short *) ind_bh->b_data) + i, + wait); + if (rc > 0) + break; + if (rc) + err = rc; + } + brelse(ind_bh); + return err; +} + +static int sync_dindirect(struct inode *inode, unsigned short *diblock, + int wait) +{ + int i; + struct buffer_head *dind_bh; + int rc, err = 0; + + rc = sync_iblock(inode, diblock, &dind_bh, wait); + if (rc || !dind_bh) + return rc; + + for (i = 0; i < 512; i++) { + rc = sync_indirect(inode, + ((unsigned short *) dind_bh->b_data) + i, + wait); + if (rc > 0) + break; + if (rc) + err = rc; + } + brelse(dind_bh); + return err; +} +#endif + +int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused) +{ + struct inode *inode = dentry->d_inode; + int wait, err = 0; + + (void) file; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + + lock_kernel(); + for (wait = 0; wait <= 1; wait++) { + err |= sync_direct(inode, wait); + } + err |= qnx4_sync_inode(inode); + unlock_kernel(); + return (err < 0) ? 
-EIO : 0; +} + +#endif diff --git a/trunk/fs/qnx4/inode.c b/trunk/fs/qnx4/inode.c index 681df5fcd161..fe1f0f31d11c 100644 --- a/trunk/fs/qnx4/inode.c +++ b/trunk/fs/qnx4/inode.c @@ -13,15 +13,19 @@ */ #include -#include +#include +#include +#include #include +#include +#include +#include #include #include #include #include -#include -#include -#include "qnx4.h" +#include +#include #define QNX4_VERSION 4 #define QNX4_BMNAME ".bitmap" @@ -30,6 +34,31 @@ static const struct super_operations qnx4_sops; #ifdef CONFIG_QNX4FS_RW +int qnx4_sync_inode(struct inode *inode) +{ + int err = 0; +# if 0 + struct buffer_head *bh; + + bh = qnx4_update_inode(inode); + if (bh && buffer_dirty(bh)) + { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + { + printk ("IO error syncing qnx4 inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -1; + } + brelse (bh); + } else if (!bh) { + err = -1; + } +# endif + + return err; +} + static void qnx4_delete_inode(struct inode *inode) { QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); @@ -41,7 +70,15 @@ static void qnx4_delete_inode(struct inode *inode) unlock_kernel(); } -static int qnx4_write_inode(struct inode *inode, int do_sync) +static void qnx4_write_super(struct super_block *sb) +{ + lock_kernel(); + QNX4DEBUG(("qnx4: write_super\n")); + sb->s_dirt = 0; + unlock_kernel(); +} + +static int qnx4_write_inode(struct inode *inode, int unused) { struct qnx4_inode_entry *raw_inode; int block, ino; @@ -78,16 +115,6 @@ static int qnx4_write_inode(struct inode *inode, int do_sync) raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); mark_buffer_dirty(bh); - if (do_sync) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk("qnx4: IO error syncing inode [%s:%08x]\n", - inode->i_sb->s_id, ino); - brelse(bh); - unlock_kernel(); - return -EIO; - } - } brelse(bh); unlock_kernel(); return 0; @@ -111,6 +138,7 @@ static const struct super_operations qnx4_sops = #ifdef CONFIG_QNX4FS_RW .write_inode = qnx4_write_inode, .delete_inode = qnx4_delete_inode, + .write_super = qnx4_write_super, #endif }; diff --git a/trunk/fs/qnx4/namei.c b/trunk/fs/qnx4/namei.c index 5972ed214937..775eed3a4085 100644 --- a/trunk/fs/qnx4/namei.c +++ b/trunk/fs/qnx4/namei.c @@ -12,9 +12,16 @@ * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include "qnx4.h" /* @@ -180,7 +187,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); clear_nlink(inode); mark_inode_dirty(inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; @@ -225,7 +232,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; diff --git a/trunk/fs/qnx4/qnx4.h b/trunk/fs/qnx4/qnx4.h deleted file mode 100644 index 9efc089454f6..000000000000 --- a/trunk/fs/qnx4/qnx4.h +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#define QNX4_DEBUG 0 - -#if QNX4_DEBUG -#define QNX4DEBUG(X) printk X -#else -#define QNX4DEBUG(X) (void) 0 -#endif - -struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ - unsigned int Version; /* may be useful */ - struct qnx4_inode_entry *BitMap; /* useful */ -}; - -struct qnx4_inode_info { - struct qnx4_inode_entry raw; - loff_t mmu_private; - struct inode vfs_inode; -}; - -extern struct inode *qnx4_iget(struct super_block *, unsigned long); -extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); -extern unsigned long qnx4_count_free_blocks(struct super_block *sb); -extern unsigned long qnx4_block_map(struct inode *inode, long iblock); - -extern struct buffer_head *qnx4_bread(struct inode *, int, int); - -extern const struct inode_operations qnx4_file_inode_operations; -extern const struct inode_operations qnx4_dir_inode_operations; -extern const struct file_operations qnx4_file_operations; -extern const struct file_operations qnx4_dir_operations; -extern int qnx4_is_free(struct super_block *sb, long block); -extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); -extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); -extern void qnx4_truncate(struct inode *inode); -extern void qnx4_free_inode(struct inode *inode); -extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); -extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); - -static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) -{ - return container_of(inode, struct qnx4_inode_info, vfs_inode); -} - -static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) -{ - return &qnx4_i(inode)->raw; -} diff --git a/trunk/fs/qnx4/truncate.c b/trunk/fs/qnx4/truncate.c index d94d9ee241fe..6437c1c3d1dd 100644 --- a/trunk/fs/qnx4/truncate.c +++ b/trunk/fs/qnx4/truncate.c @@ -10,8 +10,12 @@ * 30-06-1998 by Frank DENIS : ugly filler. 
*/ +#include +#include +#include +#include #include -#include "qnx4.h" +#include #ifdef CONFIG_QNX4FS_RW diff --git a/trunk/fs/quota/quota.c b/trunk/fs/quota/quota.c index 95c5b42384b2..b7f5a468f076 100644 --- a/trunk/fs/quota/quota.c +++ b/trunk/fs/quota/quota.c @@ -159,14 +159,10 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, return error; } -#ifdef CONFIG_QUOTA -void sync_quota_sb(struct super_block *sb, int type) +static void quota_sync_sb(struct super_block *sb, int type) { int cnt; - if (!sb->s_qcop->quota_sync) - return; - sb->s_qcop->quota_sync(sb, type); if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) @@ -195,13 +191,17 @@ void sync_quota_sb(struct super_block *sb, int type) } mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); } -#endif -static void sync_dquots(int type) +void sync_dquots(struct super_block *sb, int type) { - struct super_block *sb; int cnt; + if (sb) { + if (sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + return; + } + spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { @@ -222,8 +222,8 @@ static void sync_dquots(int type) sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) - sync_quota_sb(sb, type); + if (sb->s_root && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) @@ -301,10 +301,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->set_dqblk(sb, type, id, &idq); } case Q_SYNC: - if (sb) - sync_quota_sb(sb, type); - else - sync_dquots(type); + sync_dquots(sb, type); return 0; case Q_XQUOTAON: diff --git a/trunk/fs/reiserfs/dir.c b/trunk/fs/reiserfs/dir.c index 6d2668fdc384..45ee3d357c70 100644 --- a/trunk/fs/reiserfs/dir.c +++ b/trunk/fs/reiserfs/dir.c @@ -44,11 +44,13 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, static inline bool is_privroot_deh(struct dentry *dir, struct reiserfs_de_head *deh) { + int ret = 0; +#ifdef CONFIG_REISERFS_FS_XATTR struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - if (reiserfs_expose_privroot(dir->d_sb)) - return 0; - return (dir == dir->d_parent && privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); + ret = (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); +#endif + return ret; } int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, diff --git a/trunk/fs/reiserfs/super.c b/trunk/fs/reiserfs/super.c index 2969773cfc22..3567fb9e3fb1 100644 --- a/trunk/fs/reiserfs/super.c +++ b/trunk/fs/reiserfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include struct file_system_type reiserfs_fs_type; @@ -65,15 +64,18 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); static int reiserfs_sync_fs(struct super_block *s, int wait) { - struct reiserfs_transaction_handle th; - - reiserfs_write_lock(s); - if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th, s, 1)) - reiserfs_flush_old_commits(s); - s->s_dirt = 0; /* Even if it's not true. - * We'll loop forever in sync_supers otherwise */ - reiserfs_write_unlock(s); + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. 
+ * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); + } else { + s->s_dirt = 0; + } return 0; } @@ -466,11 +468,6 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; - lock_kernel(); - - if (s->s_dirt) - reiserfs_write_super(s); - /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { @@ -503,7 +500,7 @@ static void reiserfs_put_super(struct super_block *s) kfree(s->s_fs_info); s->s_fs_info = NULL; - unlock_kernel(); + return; } static struct kmem_cache *reiserfs_inode_cachep; @@ -901,7 +898,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin {"conv",.setmask = 1 << REISERFS_CONVERT}, {"attrs",.setmask = 1 << REISERFS_ATTRS}, {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, - {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, #ifdef CONFIG_REISERFS_FS_XATTR {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, @@ -1197,7 +1193,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif - lock_kernel(); rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options @@ -1320,12 +1315,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) out_ok: replace_mount_options(s, new_opts); - unlock_kernel(); return 0; out_err: kfree(new_opts); - unlock_kernel(); return err; } diff --git a/trunk/fs/reiserfs/xattr.c b/trunk/fs/reiserfs/xattr.c index f3d47d856848..8e7deb0e6964 100644 --- a/trunk/fs/reiserfs/xattr.c +++ b/trunk/fs/reiserfs/xattr.c @@ -981,8 +981,7 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - if (!reiserfs_expose_privroot(s)) - s->s_root->d_op = &xattr_lookup_poison_ops; + s->s_root->d_op = &xattr_lookup_poison_ops; if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else diff --git a/trunk/fs/smbfs/inode.c b/trunk/fs/smbfs/inode.c index 1402d2d54f52..fc27fbfc5397 100644 --- a/trunk/fs/smbfs/inode.c +++ b/trunk/fs/smbfs/inode.c @@ -474,8 +474,6 @@ smb_put_super(struct super_block *sb) { struct smb_sb_info *server = SMB_SB(sb); - lock_kernel(); - smb_lock_server(server); server->state = CONN_INVALID; smbiod_unregister_server(server); @@ -491,8 +489,6 @@ smb_put_super(struct super_block *sb) smb_unlock_server(server); put_pid(server->conn_pid); kfree(server); - - unlock_kernel(); } static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) diff --git a/trunk/fs/squashfs/super.c b/trunk/fs/squashfs/super.c index 3b52770f46ff..0adc624c956f 100644 --- a/trunk/fs/squashfs/super.c +++ b/trunk/fs/squashfs/super.c @@ -338,8 +338,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data) static void squashfs_put_super(struct super_block *sb) { - lock_kernel(); - if (sb->s_fs_info) { struct squashfs_sb_info *sbi = sb->s_fs_info; squashfs_cache_delete(sbi->block_cache); @@ -352,8 +350,6 @@ static void squashfs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; } - - unlock_kernel(); } diff --git a/trunk/fs/super.c b/trunk/fs/super.c index 83b47416d006..1943fdf655fa 100644 --- a/trunk/fs/super.c +++ b/trunk/fs/super.c @@ -28,6 +28,7 @@ #include #include #include +#include /* for fsync_super() */ #include #include #include @@ 
-37,6 +38,7 @@ #include #include #include +#include #include #include "internal.h" @@ -70,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); + INIT_LIST_HEAD(&s->s_async_list); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -282,6 +285,38 @@ void unlock_super(struct super_block * sb) EXPORT_SYMBOL(lock_super); EXPORT_SYMBOL(unlock_super); +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. Requires a second blkdev + * flush by the caller to complete the operation. + */ +void __fsync_super(struct super_block *sb) +{ + sync_inodes_sb(sb, 0); + vfs_dq_sync(sb); + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + sync_inodes_sb(sb, 1); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_super(struct super_block *sb) +{ + __fsync_super(sb); + return sync_blockdev(sb->s_bdev); +} +EXPORT_SYMBOL_GPL(fsync_super); + /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill @@ -303,13 +338,21 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); - sync_filesystem(sb); - get_fs_excl(); + fsync_super(sb); + lock_super(sb); sb->s_flags &= ~MS_ACTIVE; + /* + * wait for asynchronous fs operations to finish before going further + */ + async_synchronize_full_domain(&sb->s_async_list); + /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); + lock_kernel(); + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); if (sop->put_super) sop->put_super(sb); @@ -319,7 +362,9 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); + + unlock_kernel(); + unlock_super(sb); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ @@ -396,14 +441,16 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); -/** - * sync_supers - helper for periodic superblock writeback - * - * Call the write_super method if present on all dirty superblocks in - * the system. This is for the periodic writeback used by most older - * filesystems. For data integrity superblock writeback use - * sync_filesystems() instead. - * +static inline void write_super(struct super_block *sb) +{ + lock_super(sb); + if (sb->s_root && sb->s_dirt) + if (sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); +} + +/* * Note: check the dirty flag before waiting, so we don't * hold up the sync while mounting a device. (The newly * mounted device won't need syncing.) 
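The write_super() helper above captures the periodic writeback protocol that the per-filesystem changes in this patch rely on: a filesystem sets s_dirt = 1 whenever it dirties on-disk metadata, and the periodic sync_supers() pass below calls ->write_super on every dirty superblock to flush and clear that flag. As a rough sketch only (not part of this patch; "examplefs" and its commit helper are invented for illustration), a filesystem participates like this:

static void examplefs_write_super(struct super_block *sb)
{
	lock_kernel();
	if (!(sb->s_flags & MS_RDONLY))
		examplefs_commit_super(sb);	/* hypothetical: rewrite the superblock */
	/* Clear s_dirt even when read-only, or sync_supers() rescans forever. */
	sb->s_dirt = 0;
	unlock_kernel();
}

After any metadata update the filesystem then only needs sb->s_dirt = 1; the next sync_supers() pass does the rest.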
@@ -415,15 +462,12 @@ void sync_supers(void) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_op->write_super && sb->s_dirt) { + if (sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt) - sb->s_op->write_super(sb); + write_super(sb); up_read(&sb->s_umount); - spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) goto restart; @@ -432,6 +476,60 @@ void sync_supers(void) spin_unlock(&sb_lock); } +/* + * Call the ->sync_fs super_op against all filesystems which are r/w and + * which implement it. + * + * This operation is careful to avoid the livelock which could easily happen + * if two or more filesystems are being continuously dirtied. s_need_sync_fs + * is used only here. We set it against all filesystems and then clear it as + * we sync them. So redirtied filesystems are skipped. + * + * But if process A is currently running sync_filesystems and then process B + * calls sync_filesystems as well, process B will set all the s_need_sync_fs + * flags again, which will cause process A to resync everything. Fix that with + * a local mutex. + * + * (Fabian) Avoid sync_fs with clean fs & wait mode 0 + */ +void sync_filesystems(int wait) +{ + struct super_block *sb; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); /* Could be down_interruptible */ + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_op->sync_fs) + continue; + if (sb->s_flags & MS_RDONLY) + continue; + sb->s_need_sync_fs = 1; + } + +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_need_sync_fs) + continue; + sb->s_need_sync_fs = 0; + if (sb->s_flags & MS_RDONLY) + continue; /* hm. Was remounted r/o meanwhile */ + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + async_synchronize_full_domain(&sb->s_async_list); + if (sb->s_root && (wait || sb->s_dirt)) + sb->s_op->sync_fs(sb, wait); + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + mutex_unlock(&mutex); +} + /** * get_super - get the superblock of a device * @bdev: device to get the superblock for @@ -517,6 +615,45 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) return err; } +/** + * mark_files_ro - mark all files read-only + * @sb: superblock in question + * + * All files are marked read-only. We don't care about pending + * delete files so this should be used in 'force' mode only. + */ + +static void mark_files_ro(struct super_block *sb) +{ + struct file *f; + +retry: + file_list_lock(); + list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + file_list_unlock(); + /* + * This can sleep, so we can't hold + * the file_list_lock() spinlock. + */ + mnt_drop_write(mnt); + mntput(mnt); + goto retry; + } + file_list_unlock(); +} + /** * do_remount_sb - asks filesystem to change mount options. 
* @sb: superblock in question @@ -538,31 +675,27 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - sync_filesystem(sb); + fsync_super(sb); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { if (force) mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) { - unlock_kernel(); + else if (!fs_may_remount_ro(sb)) return -EBUSY; - } retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) { - unlock_kernel(); + if (retval < 0 && retval != -ENOSYS) return -EBUSY; - } } remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { + lock_super(sb); retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) { - unlock_kernel(); + unlock_super(sb); + if (retval) return retval; - } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); if (remount_rw) @@ -578,17 +711,18 @@ static void do_emergency_remount(struct work_struct *work) list_for_each_entry(sb, &super_blocks, s_list) { sb->s_count++; spin_unlock(&sb_lock); - down_write(&sb->s_umount); + down_read(&sb->s_umount); if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { /* * ->remount_fs needs lock_kernel(). * * What lock protects sb->s_flags?? */ + lock_kernel(); do_remount_sb(sb, MS_RDONLY, NULL, 1); + unlock_kernel(); } - up_write(&sb->s_umount); - put_super(sb); + drop_super(sb); spin_lock(&sb_lock); } spin_unlock(&sb_lock); diff --git a/trunk/fs/sync.c b/trunk/fs/sync.c index dd200025af85..7abc65fbf21d 100644 --- a/trunk/fs/sync.c +++ b/trunk/fs/sync.c @@ -13,123 +13,38 @@ #include #include #include -#include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. */ -static int __sync_filesystem(struct super_block *sb, int wait) +static void do_sync(unsigned long wait) { - /* Avoid doing twice syncing and cache pruning for quota sync */ + wakeup_pdflush(0); + sync_inodes(0); /* All mappings, inodes and their blockdevs */ + vfs_dq_sync(NULL); + sync_supers(); /* Write the superblocks */ + sync_filesystems(0); /* Start syncing the filesystems */ + sync_filesystems(wait); /* Waitingly sync the filesystems */ + sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ if (!wait) - writeout_quota_sb(sb, -1); - else - sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* - * Write out and wait upon all dirty data associated with this - * superblock. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int sync_filesystem(struct super_block *sb) -{ - int ret; - - /* - * We need to be protected against the filesystem going from - * r/o to r/w or vice versa. - */ - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - - /* - * No point in syncing out anything if the filesystem is read-only. 
- */ - if (sb->s_flags & MS_RDONLY) - return 0; - - ret = __sync_filesystem(sb, 0); - if (ret < 0) - return ret; - return __sync_filesystem(sb, 1); -} -EXPORT_SYMBOL_GPL(sync_filesystem); - -/* - * Sync all the data for all the filesystems (called by sys_sync() and - * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. - */ -static void sync_filesystems(int wait) -{ - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) - sb->s_need_sync = 1; - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root) - __sync_filesystem(sb, wait); - up_read(&sb->s_umount); - - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); + printk("Emergency Sync complete\n"); + if (unlikely(laptop_mode)) + laptop_sync_completion(); } SYSCALL_DEFINE0(sync) { - sync_filesystems(0); - sync_filesystems(1); - if (unlikely(laptop_mode)) - laptop_sync_completion(); + do_sync(1); return 0; } static void do_sync_work(struct work_struct *work) { - /* - * Sync twice to reduce the possibility we skipped some inodes / pages - * because they were temporarily locked - */ - sync_filesystems(0); - sync_filesystems(0); - printk("Emergency Sync complete\n"); + do_sync(0); kfree(work); } @@ -160,8 +75,10 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) /* sync the superblock to buffers */ sb = inode->i_sb; + lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); + unlock_super(sb); /* .. 
finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); diff --git a/trunk/fs/sysv/dir.c b/trunk/fs/sysv/dir.c index c7798079e644..56f655254bfe 100644 --- a/trunk/fs/sysv/dir.c +++ b/trunk/fs/sysv/dir.c @@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t); const struct file_operations sysv_dir_operations = { .read = generic_read_dir, .readdir = sysv_readdir, - .fsync = simple_fsync, + .fsync = sysv_sync_file, }; static inline void dir_put_page(struct page *page) diff --git a/trunk/fs/sysv/file.c b/trunk/fs/sysv/file.c index 96340c01f4a7..589be21d884e 100644 --- a/trunk/fs/sysv/file.c +++ b/trunk/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = sysv_sync_file, .splice_read = generic_file_splice_read, }; @@ -34,3 +34,18 @@ const struct inode_operations sysv_file_inode_operations = { .truncate = sysv_truncate, .getattr = sysv_getattr, }; + +int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + + err |= sysv_sync_inode(inode); + return err ? -EIO : 0; +} diff --git a/trunk/fs/sysv/inode.c b/trunk/fs/sysv/inode.c index 479923456a54..da20b48d350f 100644 --- a/trunk/fs/sysv/inode.c +++ b/trunk/fs/sysv/inode.c @@ -31,13 +31,15 @@ #include #include "sysv.h" -static int sysv_sync_fs(struct super_block *sb, int wait) +/* This is only called on sync() and umount(), when s_dirt=1. */ +static void sysv_write_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); unsigned long time = get_seconds(), old_time; - lock_super(sb); lock_kernel(); + if (sb->s_flags & MS_RDONLY) + goto clean; /* * If we are going to write out the super block, @@ -51,30 +53,18 @@ static int sysv_sync_fs(struct super_block *sb, int wait) *sbi->s_sb_time = cpu_to_fs32(sbi, time); mark_buffer_dirty(sbi->s_bh2); } - +clean: + sb->s_dirt = 0; unlock_kernel(); - unlock_super(sb); - - return 0; -} - -static void sysv_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - sysv_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_super(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; if (!(*flags & MS_RDONLY)) sb->s_dirt = 1; - unlock_super(sb); return 0; } @@ -82,11 +72,6 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_kernel(); - - if (sb->s_dirt) - sysv_write_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { /* XXX ext2 also updates the state here */ mark_buffer_dirty(sbi->s_bh1); @@ -99,8 +84,6 @@ static void sysv_put_super(struct super_block *sb) brelse(sbi->s_bh2); kfree(sbi); - - unlock_kernel(); } static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -253,7 +236,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) return ERR_PTR(-EIO); } -int sysv_write_inode(struct inode *inode, int wait) +static struct buffer_head * sysv_update_inode(struct inode * inode) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -261,21 +244,19 @@ int sysv_write_inode(struct inode *inode, int wait) struct sysv_inode * raw_inode; struct sysv_inode_info * 
si; unsigned int ino, block; - int err = 0; ino = inode->i_ino; if (!ino || ino > sbi->s_ninodes) { printk("Bad inode number on dev %s: %d is out of range\n", inode->i_sb->s_id, ino); - return -EIO; + return NULL; } raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) { printk("unable to read i-node block\n"); - return -EIO; + return NULL; } - lock_kernel(); raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); @@ -291,23 +272,38 @@ int sysv_write_inode(struct inode *inode, int wait) for (block = 0; block < 10+1+1+1; block++) write3byte(sbi, (u8 *)&si->i_data[block], &raw_inode->i_data[3*block]); - unlock_kernel(); mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk ("IO error syncing sysv inode [%s:%08x]\n", - sb->s_id, ino); - err = -EIO; - } - } + return bh; +} + +int sysv_write_inode(struct inode * inode, int wait) +{ + struct buffer_head *bh; + lock_kernel(); + bh = sysv_update_inode(inode); brelse(bh); + unlock_kernel(); return 0; } -int sysv_sync_inode(struct inode *inode) +int sysv_sync_inode(struct inode * inode) { - return sysv_write_inode(inode, 1); + int err = 0; + struct buffer_head *bh; + + bh = sysv_update_inode(inode); + if (bh && buffer_dirty(bh)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing sysv inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -1; + } + } + else if (!bh) + err = -1; + brelse (bh); + return err; } static void sysv_delete_inode(struct inode *inode) @@ -351,7 +347,6 @@ const struct super_operations sysv_sops = { .delete_inode = sysv_delete_inode, .put_super = sysv_put_super, .write_super = sysv_write_super, - .sync_fs = sysv_sync_fs, .remount_fs = sysv_remount, .statfs = sysv_statfs, }; diff --git a/trunk/fs/sysv/sysv.h b/trunk/fs/sysv/sysv.h index 53786eb5cf60..5784a318c883 100644 --- a/trunk/fs/sysv/sysv.h +++ b/trunk/fs/sysv/sysv.h @@ -144,6 +144,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, int); extern int sysv_sync_inode(struct inode *); +extern int sysv_sync_file(struct file *, struct dentry *, int); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int sysv_init_icache(void); diff --git a/trunk/fs/ubifs/super.c b/trunk/fs/ubifs/super.c index 3589eab02a2f..e9f7a754c4f7 100644 --- a/trunk/fs/ubifs/super.c +++ b/trunk/fs/ubifs/super.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "ubifs.h" /* @@ -448,6 +447,9 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; + if (sb->s_flags & MS_RDONLY) + return 0; + /* * VFS calls '->sync_fs()' before synchronizing all dirty inodes and * pages, so synchronize them first, then commit the journal. Strictly @@ -1685,9 +1687,6 @@ static void ubifs_put_super(struct super_block *sb) ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); - - lock_kernel(); - /* * The following asserts are only valid if there has not been a failure * of the media. 
For example, there will be dirty inodes if we failed @@ -1754,8 +1753,6 @@ static void ubifs_put_super(struct super_block *sb) ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); kfree(c); - - unlock_kernel(); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -1771,22 +1768,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - lock_kernel(); if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); - unlock_kernel(); return -EROFS; } err = ubifs_remount_rw(c); - if (err) { - unlock_kernel(); + if (err) return err; - } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); - unlock_kernel(); return -EROFS; } ubifs_remount_ro(c); @@ -1801,7 +1793,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) } ubifs_assert(c->lst.taken_empty_lebs > 0); - unlock_kernel(); return 0; } diff --git a/trunk/fs/udf/Makefile b/trunk/fs/udf/Makefile index eb880f66c23a..0d4503f7446d 100644 --- a/trunk/fs/udf/Makefile +++ b/trunk/fs/udf/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UDF_FS) += udf.o udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ - partition.o super.o truncate.o symlink.o \ + partition.o super.o truncate.o symlink.o fsync.o \ directory.o misc.o udftime.o unicode.o diff --git a/trunk/fs/udf/dir.c b/trunk/fs/udf/dir.c index 61d9a76a3a69..2efd4d5291b6 100644 --- a/trunk/fs/udf/dir.c +++ b/trunk/fs/udf/dir.c @@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = { .read = generic_read_dir, .readdir = udf_readdir, .ioctl = udf_ioctl, - .fsync = simple_fsync, + .fsync = udf_fsync_file, }; diff --git a/trunk/fs/udf/file.c b/trunk/fs/udf/file.c index 7464305382b5..eb91f3b70320 100644 --- a/trunk/fs/udf/file.c +++ b/trunk/fs/udf/file.c @@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = { .write = do_sync_write, .aio_write = udf_file_aio_write, .release = udf_release_file, - .fsync = simple_fsync, + .fsync = udf_fsync_file, .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; diff --git a/trunk/fs/udf/fsync.c b/trunk/fs/udf/fsync.c new file mode 100644 index 000000000000..b2c472b733b8 --- /dev/null +++ b/trunk/fs/udf/fsync.c @@ -0,0 +1,52 @@ +/* + * fsync.c + * + * PURPOSE + * Fsync handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 05/22/99 blf Created. + */ + +#include "udfdecl.h" + +#include + +static int udf_fsync_inode(struct inode *, int); + +/* + * File may be NULL when we are called. Perhaps we shouldn't + * even pass file to fsync ? + */ + +int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + + return udf_fsync_inode(inode, datasync); +} + +static int udf_fsync_inode(struct inode *inode, int datasync) +{ + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + + err |= udf_sync_inode(inode); + + return err ? 
-EIO : 0; +} diff --git a/trunk/fs/udf/super.c b/trunk/fs/udf/super.c index 6832135159b6..0ba44107d8f1 100644 --- a/trunk/fs/udf/super.c +++ b/trunk/fs/udf/super.c @@ -568,7 +568,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) if (!udf_parse_options(options, &uopt, true)) return -EINVAL; - lock_kernel(); sbi->s_flags = uopt.flags; sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; @@ -582,16 +581,13 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) *flags |= MS_RDONLY; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - } if (*flags & MS_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); - unlock_kernel(); return 0; } @@ -2066,9 +2062,6 @@ static void udf_put_super(struct super_block *sb) struct udf_sb_info *sbi; sbi = UDF_SB(sb); - - lock_kernel(); - if (sbi->s_vat_inode) iput(sbi->s_vat_inode); if (sbi->s_partitions) @@ -2084,8 +2077,6 @@ static void udf_put_super(struct super_block *sb) kfree(sbi->s_partmaps); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } static int udf_sync_fs(struct super_block *sb, int wait) diff --git a/trunk/fs/udf/udfdecl.h b/trunk/fs/udf/udfdecl.h index 8d46f4294ee7..cac51b77a5d1 100644 --- a/trunk/fs/udf/udfdecl.h +++ b/trunk/fs/udf/udfdecl.h @@ -223,6 +223,9 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, extern int udf_new_block(struct super_block *, struct inode *, uint16_t, uint32_t, int *); +/* fsync.c */ +extern int udf_fsync_file(struct file *, struct dentry *, int); + /* directory.c */ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, diff --git a/trunk/fs/ufs/dir.c b/trunk/fs/ufs/dir.c index 6f671f1ac271..6321b797061b 100644 --- a/trunk/fs/ufs/dir.c +++ b/trunk/fs/ufs/dir.c @@ -666,6 +666,6 @@ int ufs_empty_dir(struct inode * inode) const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = simple_fsync, + .fsync = ufs_sync_file, .llseek = generic_file_llseek, }; diff --git a/trunk/fs/ufs/file.c b/trunk/fs/ufs/file.c index 73655c61240a..2bd3a1615714 100644 --- a/trunk/fs/ufs/file.c +++ b/trunk/fs/ufs/file.c @@ -24,10 +24,31 @@ */ #include +#include /* for sync_mapping_buffers() */ #include "ufs_fs.h" #include "ufs.h" + +int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = ufs_sync_inode(inode); + if (ret == 0) + ret = err; + return ret; +} + + /* * We have mostly NULL's here: the current defaults are ok for * the ufs filesystem. 
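ufs_sync_file above is the same per-inode fsync pattern this patch reintroduces for omfs, sysv and udf as well: push the data buffers of the mapping first, skip the inode when it is clean, honour the datasync hint by writing the inode only when data-integrity state (I_DIRTY_DATASYNC) is outstanding, and report -EIO on failure. Condensed to its skeleton (a sketch only, with a hypothetical examplefs_sync_inode() standing in for each filesystem's on-disk inode writer):

int examplefs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err;

	/* Write the data buffers hanging off this mapping. */
	err = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY))
		return err;		/* inode itself is clean */
	/* For fdatasync(), a timestamp-only dirty inode can be skipped. */
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return err;
	err |= examplefs_sync_inode(inode);	/* hypothetical inode writer */
	return err ? -EIO : 0;
}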
@@ -41,6 +62,6 @@ const struct file_operations ufs_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .open = generic_file_open, - .fsync = simple_fsync, + .fsync = ufs_sync_file, .splice_read = generic_file_splice_read, }; diff --git a/trunk/fs/ufs/super.c b/trunk/fs/ufs/super.c index 5faed7954d0a..60359291761f 100644 --- a/trunk/fs/ufs/super.c +++ b/trunk/fs/ufs/super.c @@ -263,7 +263,6 @@ void ufs_panic (struct super_block * sb, const char * function, struct ufs_super_block_first * usb1; va_list args; - lock_kernel(); uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); @@ -595,9 +594,6 @@ static void ufs_put_super_internal(struct super_block *sb) UFSD("ENTER\n"); - - lock_kernel(); - ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -625,9 +621,6 @@ static void ufs_put_super_internal(struct super_block *sb) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); kfree (base); - - unlock_kernel(); - UFSD("EXIT\n"); } @@ -1125,45 +1118,32 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; } -static int ufs_sync_fs(struct super_block *sb, int wait) +static void ufs_write_super(struct super_block *sb) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; unsigned flags; - lock_super(sb); lock_kernel(); - UFSD("ENTER\n"); - flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); - if ((flags & UFS_ST_MASK) == UFS_ST_SUN || - (flags & UFS_ST_MASK) == UFS_ST_SUNOS || - (flags & UFS_ST_MASK) == UFS_ST_SUNx86) - ufs_set_fs_state(sb, usb1, usb3, - UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); - ufs_put_cstotal(sb); + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN + || (flags & UFS_ST_MASK) == UFS_ST_SUNOS + || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); + } sb->s_dirt = 0; - UFSD("EXIT\n"); unlock_kernel(); - unlock_super(sb); - - return 0; -} - -static void ufs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - ufs_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static void ufs_put_super(struct super_block *sb) @@ -1172,9 +1152,6 @@ static void ufs_put_super(struct super_block *sb) UFSD("ENTER\n"); - if (sb->s_dirt) - ufs_write_super(sb); - if (!(sb->s_flags & MS_RDONLY)) ufs_put_super_internal(sb); @@ -1194,9 +1171,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) struct ufs_super_block_third * usb3; unsigned new_mount_opt, ufstype; unsigned flags; - - lock_kernel(); - lock_super(sb); + uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1209,24 +1184,17 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options (data, &new_mount_opt)) { - unlock_super(sb); - unlock_kernel(); + if (!ufs_parse_options (data, &new_mount_opt)) return -EINVAL; - } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); - unlock_super(sb); - unlock_kernel(); return -EINVAL; } if 
((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; - unlock_super(sb); - unlock_kernel(); return 0; } @@ -1251,8 +1219,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); - unlock_super(sb); - unlock_kernel(); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1261,22 +1227,16 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); - unlock_super(sb); - unlock_kernel(); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); - unlock_super(sb); - unlock_kernel(); return -EPERM; } sb->s_flags &= ~MS_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; - unlock_super(sb); - unlock_kernel(); return 0; } @@ -1392,7 +1352,6 @@ static const struct super_operations ufs_super_ops = { .delete_inode = ufs_delete_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, - .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, .show_options = ufs_show_options, diff --git a/trunk/fs/ufs/ufs.h b/trunk/fs/ufs/ufs.h index 644e77e13599..d0c4acd4f1f3 100644 --- a/trunk/fs/ufs/ufs.h +++ b/trunk/fs/ufs/ufs.h @@ -99,6 +99,7 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, extern const struct inode_operations ufs_file_inode_operations; extern const struct file_operations ufs_file_operations; extern const struct address_space_operations ufs_aops; +extern int ufs_sync_file(struct file *, struct dentry *, int); /* ialloc.c */ extern void ufs_free_inode (struct inode *inode); diff --git a/trunk/fs/xattr.c b/trunk/fs/xattr.c index 1c3d0af59ddf..d51b8f9db921 100644 --- a/trunk/fs/xattr.c +++ b/trunk/fs/xattr.c @@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write_file(f); + error = mnt_want_write(f->f_path.mnt); if (!error) { error = setxattr(dentry, name, value, size, flags); mnt_drop_write(f->f_path.mnt); @@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write_file(f); + error = mnt_want_write(f->f_path.mnt); if (!error) { error = removexattr(dentry, name); mnt_drop_write(f->f_path.mnt); diff --git a/trunk/fs/xfs/linux-2.6/xfs_super.c b/trunk/fs/xfs/linux-2.6/xfs_super.c index 08d6bd9a3947..bb685269f832 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_super.c +++ b/trunk/fs/xfs/linux-2.6/xfs_super.c @@ -1104,6 +1104,15 @@ xfs_fs_put_super( kfree(mp); } +STATIC void +xfs_fs_write_super( + struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + xfs_sync_fsdata(XFS_M(sb), 0); + sb->s_dirt = 0; +} + STATIC int xfs_fs_sync_super( struct super_block *sb, @@ -1128,6 +1137,7 @@ xfs_fs_sync_super( error = xfs_quiesce_data(mp); else error = xfs_sync_fsdata(mp, 0); + sb->s_dirt = 0; if (unlikely(laptop_mode)) { int prev_sync_seq = mp->m_sync_seq; @@ -1433,6 +1443,7 @@ xfs_fs_fill_super( XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); + sb->s_dirt = 1; sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; @@ -1522,6 +1533,7 @@ static struct super_operations 
xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, + .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, diff --git a/trunk/fs/xfs/xfs_trans.c b/trunk/fs/xfs/xfs_trans.c index bcc39d358ad3..8570b826fedd 100644 --- a/trunk/fs/xfs/xfs_trans.c +++ b/trunk/fs/xfs/xfs_trans.c @@ -628,6 +628,8 @@ xfs_trans_apply_sb_deltas( xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), offsetof(xfs_dsb_t, sb_frextents) + sizeof(sbp->sb_frextents) - 1); + + tp->t_mountp->m_super->s_dirt = 1; } /* diff --git a/trunk/include/linux/Kbuild b/trunk/include/linux/Kbuild index b3afd2219ad2..3f0eaa397ef5 100644 --- a/trunk/include/linux/Kbuild +++ b/trunk/include/linux/Kbuild @@ -135,7 +135,6 @@ header-y += posix_types.h header-y += ppdev.h header-y += prctl.h header-y += qnxtypes.h -header-y += qnx4_fs.h header-y += radeonfb.h header-y += raw.h header-y += resource.h @@ -309,6 +308,7 @@ unifdef-y += poll.h unifdef-y += ppp_defs.h unifdef-y += ppp-comp.h unifdef-y += ptrace.h +unifdef-y += qnx4_fs.h unifdef-y += quota.h unifdef-y += random.h unifdef-y += irqnr.h diff --git a/trunk/include/linux/cdev.h b/trunk/include/linux/cdev.h index f389e319a454..fb4591977b03 100644 --- a/trunk/include/linux/cdev.h +++ b/trunk/include/linux/cdev.h @@ -28,8 +28,6 @@ int cdev_add(struct cdev *, dev_t, unsigned); void cdev_del(struct cdev *); -int cdev_index(struct inode *inode); - void cd_forget(struct inode *); extern struct backing_dev_info directly_mappable_cdev_bdi; diff --git a/trunk/include/linux/cramfs_fs.h b/trunk/include/linux/cramfs_fs.h index 6fc2bed368b8..3be4e5a27d82 100644 --- a/trunk/include/linux/cramfs_fs.h +++ b/trunk/include/linux/cramfs_fs.h @@ -2,8 +2,9 @@ #define __CRAMFS_H #include <linux/types.h> -#include <linux/magic.h> +#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ +#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ #define CRAMFS_SIGNATURE "Compressed ROMFS" /* diff --git a/trunk/include/linux/dcache.h b/trunk/include/linux/dcache.h index 30b93b2a01a4..15156364d196 100644 --- a/trunk/include/linux/dcache.h +++ b/trunk/include/linux/dcache.h @@ -180,12 +180,10 @@ d_iput: no no no yes #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard.
*/ #define DCACHE_UNHASHED 0x0010 -#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched by inotify */ +#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ #define DCACHE_COOKIE 0x0040 /* For use by dcookie subsystem */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ - extern spinlock_t dcache_lock; extern seqlock_t rename_lock; @@ -353,11 +351,6 @@ static inline int d_unhashed(struct dentry *dentry) return (dentry->d_flags & DCACHE_UNHASHED); } -static inline int d_unlinked(struct dentry *dentry) -{ - return d_unhashed(dentry) && !IS_ROOT(dentry); -} - static inline struct dentry *dget_parent(struct dentry *dentry) { struct dentry *ret; @@ -375,7 +368,7 @@ static inline int d_mountpoint(struct dentry *dentry) return dentry->d_mounted; } -extern struct vfsmount *lookup_mnt(struct path *); +extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); extern int sysctl_vfs_cache_pressure; diff --git a/trunk/include/linux/dnotify.h b/trunk/include/linux/dnotify.h index ecc06286226d..102a902b4396 100644 --- a/trunk/include/linux/dnotify.h +++ b/trunk/include/linux/dnotify.h @@ -10,7 +10,7 @@ struct dnotify_struct { struct dnotify_struct * dn_next; - __u32 dn_mask; + unsigned long dn_mask; int dn_fd; struct file * dn_filp; fl_owner_t dn_owner; @@ -21,18 +21,23 @@ struct dnotify_struct { #ifdef CONFIG_DNOTIFY -#define DNOTIFY_ALL_EVENTS (FS_DELETE | FS_DELETE_CHILD |\ - FS_MODIFY | FS_MODIFY_CHILD |\ - FS_ACCESS | FS_ACCESS_CHILD |\ - FS_ATTRIB | FS_ATTRIB_CHILD |\ - FS_CREATE | FS_DN_RENAME |\ - FS_MOVED_FROM | FS_MOVED_TO) - +extern void __inode_dir_notify(struct inode *, unsigned long); extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); +extern void dnotify_parent(struct dentry *, unsigned long); + +static inline void inode_dir_notify(struct inode *inode, unsigned long event) +{ + if (inode->i_dnotify_mask & (event)) + __inode_dir_notify(inode, event); +} #else +static inline void __inode_dir_notify(struct inode *inode, unsigned long event) +{ +} + static inline void dnotify_flush(struct file *filp, fl_owner_t id) { } @@ -42,6 +47,14 @@ static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) return -EINVAL; } +static inline void dnotify_parent(struct dentry *dentry, unsigned long event) +{ +} + +static inline void inode_dir_notify(struct inode *inode, unsigned long event) +{ +} + #endif /* CONFIG_DNOTIFY */ #endif /* __KERNEL __ */ diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index ede84fa7da5d..83d6b4397245 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -729,8 +729,8 @@ struct inode { struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; - blkcnt_t i_blocks; unsigned int i_blkbits; + blkcnt_t i_blocks; unsigned short i_bytes; umode_t i_mode; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ @@ -751,12 +751,13 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; }; + int i_cindex; __u32 i_generation; -#ifdef CONFIG_FSNOTIFY - __u32 i_fsnotify_mask; /* all events this inode cares about */ - struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ +#ifdef CONFIG_DNOTIFY + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ #endif #ifdef CONFIG_INOTIFY @@ 
-1320,7 +1321,7 @@ struct super_block { struct rw_semaphore s_umount; struct mutex s_lock; int s_count; - int s_need_sync; + int s_need_sync_fs; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; @@ -1371,6 +1372,11 @@ struct super_block { * generic_show_options() */ char *s_options; + + /* + * storage for asynchronous operations + */ + struct list_head s_async_list; }; extern struct timespec current_fs_time(struct super_block *sb); @@ -1794,7 +1800,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(struct path *); +extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); @@ -1941,6 +1947,8 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); +extern int fsync_super(struct super_block *); +extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } @@ -1956,7 +1964,6 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) return 0; } #endif -extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; extern const struct file_operations bad_sock_fops; @@ -2075,8 +2082,12 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); extern void sync_supers(void); +extern void sync_filesystems(int wait); +extern void __fsync_super(struct super_block *sb); extern void emergency_sync(void); extern void emergency_remount(void); +extern int do_remount_sb(struct super_block *sb, int flags, + void *data, int force); #ifdef CONFIG_BLOCK extern sector_t bmap(struct inode *, sector_t); #endif @@ -2345,8 +2356,6 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); -extern int simple_fsync(struct file *, struct dentry *, int); - #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *); diff --git a/trunk/include/linux/fsnotify.h b/trunk/include/linux/fsnotify.h index 936f9aa8bb97..00fbd5b245c9 100644 --- a/trunk/include/linux/fsnotify.h +++ b/trunk/include/linux/fsnotify.h @@ -13,7 +13,6 @@ #include <linux/dnotify.h> #include <linux/inotify.h> -#include <linux/fsnotify_backend.h> #include <linux/audit.h> /* @@ -23,44 +22,18 @@ static inline void fsnotify_d_instantiate(struct dentry *entry, struct inode *inode) { - __fsnotify_d_instantiate(entry, inode); - inotify_d_instantiate(entry, inode); } -/* Notify this dentry's parent about a child's events. */ -static inline void fsnotify_parent(struct dentry *dentry, __u32 mask) -{ - __fsnotify_parent(dentry, mask); - - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); -} - /* * fsnotify_d_move - entry has been moved * Called with dcache_lock and entry->d_lock held.
*/ static inline void fsnotify_d_move(struct dentry *entry) { - /* - * On move we need to update entry->d_flags to indicate if the new parent - * cares about events from this entry. - */ - __fsnotify_update_dcache_flags(entry); - inotify_d_move(entry); } -/* - * fsnotify_link_count - inode's link count changed - */ -static inline void fsnotify_link_count(struct inode *inode) -{ - inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); - - fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0); -} - /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ @@ -69,62 +42,42 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; - u32 in_cookie = inotify_get_cookie(); - u32 fs_cookie = fsnotify_get_cookie(); - __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM); - __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO); + u32 cookie = inotify_get_cookie(); if (old_dir == new_dir) - old_dir_mask |= FS_DN_RENAME; - - if (isdir) { - isdir = IN_ISDIR; - old_dir_mask |= FS_IN_ISDIR; - new_dir_mask |= FS_IN_ISDIR; + inode_dir_notify(old_dir, DN_RENAME); + else { + inode_dir_notify(old_dir, DN_DELETE); + inode_dir_notify(new_dir, DN_CREATE); } - inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name, + if (isdir) + isdir = IN_ISDIR; + inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name, source); - inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name, + inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name, source); - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); - if (target) { inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(target); - - /* this is really a link_count change not a removal */ - fsnotify_link_count(target); } if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); - fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); } audit_inode_child(new_name, moved, new_dir); } -/* - * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed - */ -static inline void fsnotify_inode_delete(struct inode *inode) -{ - __fsnotify_inode_delete(inode); -} - /* * fsnotify_nameremove - a filename was removed from a directory */ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) { - __u32 mask = FS_DELETE; - if (isdir) - mask |= FS_IN_ISDIR; - - fsnotify_parent(dentry, mask); + isdir = IN_ISDIR; + dnotify_parent(dentry, DN_DELETE); + inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name); } /* @@ -134,9 +87,14 @@ static inline void fsnotify_inoderemove(struct inode *inode) { inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(inode); +} - fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - __fsnotify_inode_delete(inode); +/* + * fsnotify_link_count - inode's link count changed + */ +static inline void fsnotify_link_count(struct inode *inode) +{ + inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); } /* @@ -144,11 +102,10 @@ static inline void fsnotify_inoderemove(struct inode *inode) */ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) { + inode_dir_notify(inode, DN_CREATE); 
inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - - fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } /* @@ -158,12 +115,11 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { + inode_dir_notify(dir, DN_CREATE); inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name, inode); fsnotify_link_count(inode); audit_inode_child(new_dentry->d_name.name, new_dentry, dir); - - fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0); } /* @@ -171,13 +127,10 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct */ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) { - __u32 mask = (FS_CREATE | FS_IN_ISDIR); - struct inode *d_inode = dentry->d_inode; - - inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); + inode_dir_notify(inode, DN_CREATE); + inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, + dentry->d_name.name, dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - - fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } /* @@ -186,15 +139,14 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) static inline void fsnotify_access(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_ACCESS; + u32 mask = IN_ACCESS; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + dnotify_parent(dentry, DN_ACCESS); + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -203,15 +155,14 @@ static inline void fsnotify_access(struct dentry *dentry) static inline void fsnotify_modify(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_MODIFY; + u32 mask = IN_MODIFY; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + dnotify_parent(dentry, DN_MODIFY); + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -220,15 +171,13 @@ static inline void fsnotify_modify(struct dentry *dentry) static inline void fsnotify_open(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_OPEN; + u32 mask = IN_OPEN; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -238,16 +187,15 @@ static inline void fsnotify_close(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; + const char *name = dentry->d_name.name; fmode_t mode = file->f_mode; - __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; + u32 mask = (mode & FMODE_WRITE) ? 
IN_CLOSE_WRITE : IN_CLOSE_NOWRITE; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + inotify_dentry_parent_queue_event(dentry, mask, 0, name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); } /* @@ -256,15 +204,13 @@ static inline void fsnotify_close(struct file *file) static inline void fsnotify_xattr(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_ATTRIB; + u32 mask = IN_ATTRIB; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -274,37 +220,50 @@ static inline void fsnotify_xattr(struct dentry *dentry) static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { struct inode *inode = dentry->d_inode; - __u32 mask = 0; - - if (ia_valid & ATTR_UID) - mask |= FS_ATTRIB; - if (ia_valid & ATTR_GID) - mask |= FS_ATTRIB; - if (ia_valid & ATTR_SIZE) - mask |= FS_MODIFY; + int dn_mask = 0; + u32 in_mask = 0; + if (ia_valid & ATTR_UID) { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } + if (ia_valid & ATTR_GID) { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } + if (ia_valid & ATTR_SIZE) { + in_mask |= IN_MODIFY; + dn_mask |= DN_MODIFY; + } /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) - mask |= FS_ATTRIB; - else if (ia_valid & ATTR_ATIME) - mask |= FS_ACCESS; - else if (ia_valid & ATTR_MTIME) - mask |= FS_MODIFY; - - if (ia_valid & ATTR_MODE) - mask |= FS_ATTRIB; + { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } else if (ia_valid & ATTR_ATIME) { + in_mask |= IN_ACCESS; + dn_mask |= DN_ACCESS; + } else if (ia_valid & ATTR_MTIME) { + in_mask |= IN_MODIFY; + dn_mask |= DN_MODIFY; + } + if (ia_valid & ATTR_MODE) { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } - if (mask) { + if (dn_mask) + dnotify_parent(dentry, dn_mask); + if (in_mask) { if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + in_mask |= IN_ISDIR; + inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL); + inotify_dentry_parent_queue_event(dentry, in_mask, 0, + dentry->d_name.name); } } -#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY) /* notify helpers */ +#ifdef CONFIG_INOTIFY /* inotify helpers */ /* * fsnotify_oldname_init - save off the old filename before we change it @@ -322,7 +281,7 @@ static inline void fsnotify_oldname_free(const char *old_name) kfree(old_name); } -#else /* CONFIG_INOTIFY || CONFIG_FSNOTIFY */ +#else /* CONFIG_INOTIFY */ static inline const char *fsnotify_oldname_init(const char *name) { diff --git a/trunk/include/linux/fsnotify_backend.h b/trunk/include/linux/fsnotify_backend.h deleted file mode 100644 index 44848aa830dc..000000000000 --- a/trunk/include/linux/fsnotify_backend.h +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Filesystem access notification for Linux - * - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - */ - -#ifndef __LINUX_FSNOTIFY_BACKEND_H -#define __LINUX_FSNOTIFY_BACKEND_H - -#ifdef __KERNEL__ - -#include <linux/idr.h> /* inotify uses this */ -#include <linux/fs.h> /* struct inode */ -#include <linux/list.h> -#include <linux/path.h> /* struct path */ -#include <linux/spinlock.h> -#include <linux/types.h> - 
-#include <asm/atomic.h> - -/* - * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily - * convert between them. dnotify only needs conversion at watch creation - * so no perf loss there. fanotify isn't defined yet, so it can use the - * wholes if it needs more events. - */ -#define FS_ACCESS 0x00000001 /* File was accessed */ -#define FS_MODIFY 0x00000002 /* File was modified */ -#define FS_ATTRIB 0x00000004 /* Metadata changed */ -#define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */ -#define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ -#define FS_OPEN 0x00000020 /* File was opened */ -#define FS_MOVED_FROM 0x00000040 /* File was moved from X */ -#define FS_MOVED_TO 0x00000080 /* File was moved to Y */ -#define FS_CREATE 0x00000100 /* Subfile was created */ -#define FS_DELETE 0x00000200 /* Subfile was deleted */ -#define FS_DELETE_SELF 0x00000400 /* Self was deleted */ -#define FS_MOVE_SELF 0x00000800 /* Self was moved */ - -#define FS_UNMOUNT 0x00002000 /* inode on umount fs */ -#define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ -#define FS_IN_IGNORED 0x00008000 /* last inotify event here */ - -#define FS_IN_ISDIR 0x40000000 /* event occurred against dir */ -#define FS_IN_ONESHOT 0x80000000 /* only send event once */ - -#define FS_DN_RENAME 0x10000000 /* file renamed */ -#define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ - -/* This inode cares about things that happen to its children. Always set for - * dnotify and inotify. */ -#define FS_EVENT_ON_CHILD 0x08000000 - -/* This is a list of all events that may get sent to a parernt based on fs event - * happening to inodes inside that directory */ -#define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\ - FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\ - FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ - FS_DELETE) - -/* listeners that hard code group numbers near the top */ -#define DNOTIFY_GROUP_NUM UINT_MAX -#define INOTIFY_GROUP_NUM (DNOTIFY_GROUP_NUM-1) - -struct fsnotify_group; -struct fsnotify_event; -struct fsnotify_mark_entry; -struct fsnotify_event_private_data; - -/* - * Each group much define these ops. The fsnotify infrastructure will call - * these operations for each relevant group. - * - * should_send_event - given a group, inode, and mask this function determines - * if the group is interested in this event. - * handle_event - main call for a group to handle an fs event - * free_group_priv - called when a group refcnt hits 0 to clean up the private union - * freeing-mark - this means that a mark has been flagged to die when everything - * finishes using it. The function is supplied with what must be a - * valid group and inode to use to clean up. - */ -struct fsnotify_ops { - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask); - int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event); - void (*free_group_priv)(struct fsnotify_group *group); - void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); - void (*free_event_priv)(struct fsnotify_event_private_data *priv); -}; - -/* - * A group is a "thing" that wants to receive notification about filesystem - * events. The mask holds the subset of event types this group cares about. - * refcnt on a group is up to the implementor and at any moment if it goes 0 - * everything will be cleaned up. - */ -struct fsnotify_group { - /* - * global list of all groups receiving events from fsnotify.
- * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex - * or fsnotify_grp_srcu depending on write vs read. - */ - struct list_head group_list; - - /* - * Defines all of the event types in which this group is interested. - * This mask is a bitwise OR of the FS_* events from above. Each time - * this mask changes for a group (if it changes) the correct functions - * must be called to update the global structures which indicate global - * interest in event types. - */ - __u32 mask; - - /* - * How the refcnt is used is up to each group. When the refcnt hits 0 - * fsnotify will clean up all of the resources associated with this group. - * As an example, the dnotify group will always have a refcnt=1 and that - * will never change. Inotify, on the other hand, has a group per - * inotify_init() and the refcnt will hit 0 only when that fd has been - * closed. - */ - atomic_t refcnt; /* things with interest in this group */ - unsigned int group_num; /* simply prevents accidental group collision */ - - const struct fsnotify_ops *ops; /* how this group handles things */ - - /* needed to send notification to userspace */ - struct mutex notification_mutex; /* protect the notification_list */ - struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ - wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ - unsigned int q_len; /* events on the queue */ - unsigned int max_events; /* maximum events allowed on the list */ - - /* stores all fastapth entries assoc with this group so they can be cleaned on unregister */ - spinlock_t mark_lock; /* protect mark_entries list */ - atomic_t num_marks; /* 1 for each mark entry and 1 for not being - * past the point of no return when freeing - * a group */ - struct list_head mark_entries; /* all inode mark entries for this group */ - - /* prevents double list_del of group_list. protected by global fsnotify_grp_mutex */ - bool on_group_list; - - /* groups can define private fields here or use the void *private */ - union { - void *private; -#ifdef CONFIG_INOTIFY_USER - struct inotify_group_private_data { - spinlock_t idr_lock; - struct idr idr; - u32 last_wd; - struct fasync_struct *fa; /* async notification */ - struct user_struct *user; - } inotify_data; -#endif - }; -}; - -/* - * A single event can be queued in multiple group->notification_lists. - * - * each group->notification_list will point to an event_holder which in turns points - * to the actual event that needs to be sent to userspace. - * - * Seemed cheaper to create a refcnt'd event and a small holder for every group - * than create a different event for every group - * - */ -struct fsnotify_event_holder { - struct fsnotify_event *event; - struct list_head event_list; -}; - -/* - * Inotify needs to tack data onto an event. This struct lets us later find the - * correct private data of the correct group. - */ -struct fsnotify_event_private_data { - struct fsnotify_group *group; - struct list_head event_list; -}; - -/* - * all of the information about the original object we want to now send to - * a group. If you want to carry more info from the accessing task to the - * listener this structure is where you need to be adding fields. - */ -struct fsnotify_event { - /* - * If we create an event we are also likely going to need a holder - * to link to a group. So embed one holder in the event. 
Means only - * one allocation for the common case where we only have one group - */ - struct fsnotify_event_holder holder; - spinlock_t lock; /* protection for the associated event_holder and private_list */ - /* to_tell may ONLY be dereferenced during handle_event(). */ - struct inode *to_tell; /* either the inode the event happened to or its parent */ - /* - * depending on the event type we should have either a path or inode - * We hold a reference on path, but NOT on inode. Since we have the ref on - * the path, it may be dereferenced at any point during this object's - * lifetime. That reference is dropped when this object's refcnt hits - * 0. If this event contains an inode instead of a path, the inode may - * ONLY be used during handle_event(). - */ - union { - struct path path; - struct inode *inode; - }; -/* when calling fsnotify tell it if the data is a path or inode */ -#define FSNOTIFY_EVENT_NONE 0 -#define FSNOTIFY_EVENT_PATH 1 -#define FSNOTIFY_EVENT_INODE 2 -#define FSNOTIFY_EVENT_FILE 3 - int data_type; /* which of the above union we have */ - atomic_t refcnt; /* how many groups still are using/need to send this event */ - __u32 mask; /* the type of access, bitwise OR for FS_* event types */ - - u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ - char *file_name; - size_t name_len; - - struct list_head private_data_list; /* groups can store private data here */ -}; - -/* - * a mark is simply an entry attached to an in core inode which allows an - * fsnotify listener to indicate they are either no longer interested in events - * of a type matching mask or only interested in those events. - * - * these are flushed when an inode is evicted from core and may be flushed - * when the inode is modified (as seen by fsnotify_access). Some fsnotify users - * (such as dnotify) will flush these when the open fd is closed and not at - * inode eviction or modification. - */ -struct fsnotify_mark_entry { - __u32 mask; /* mask this mark entry is for */ - /* we hold ref for each i_list and g_list. also one ref for each 'thing' - * in kernel that found and may be using this mark. */ - atomic_t refcnt; /* active things looking at this mark */ - struct inode *inode; /* inode this entry is associated with */ - struct fsnotify_group *group; /* group this mark entry is for */ - struct hlist_node i_list; /* list of mark_entries by inode->i_fsnotify_mark_entries */ - struct list_head g_list; /* list of mark_entries by group->i_fsnotify_mark_entries */ - spinlock_t lock; /* protect group, inode, and killme */ - struct list_head free_i_list; /* tmp list used when freeing this mark */ - struct list_head free_g_list; /* tmp list used when freeing this mark */ - void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */ -}; - -#ifdef CONFIG_FSNOTIFY - -/* called from the vfs helpers */ - -/* main fsnotify call to send events */ -extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, - const char *name, u32 cookie); -extern void __fsnotify_parent(struct dentry *dentry, __u32 mask); -extern void __fsnotify_inode_delete(struct inode *inode); -extern u32 fsnotify_get_cookie(void); - -static inline int fsnotify_inode_watches_children(struct inode *inode) -{ - /* FS_EVENT_ON_CHILD is set if the inode may care */ - if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) - return 0; - /* this inode might care about child events, does it care about the - * specific set of events that can happen on a child? 
*/ - return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; -} - -/* - * Update the dentry with a flag indicating the interest of its parent to receive - * filesystem events when those events happens to this dentry->d_inode. - */ -static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) -{ - struct dentry *parent; - - assert_spin_locked(&dcache_lock); - assert_spin_locked(&dentry->d_lock); - - parent = dentry->d_parent; - if (fsnotify_inode_watches_children(parent->d_inode)) - dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; - else - dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; -} - -/* - * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. - */ -static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) -{ - if (!inode) - return; - - assert_spin_locked(&dcache_lock); - - spin_lock(&dentry->d_lock); - __fsnotify_update_dcache_flags(dentry); - spin_unlock(&dentry->d_lock); -} - -/* called from fsnotify listeners, such as fanotify or dnotify */ - -/* must call when a group changes its ->mask */ -extern void fsnotify_recalc_global_mask(void); -/* get a reference to an existing or create a new group */ -extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, - __u32 mask, - const struct fsnotify_ops *ops); -/* run all marks associated with this group and update group->mask */ -extern void fsnotify_recalc_group_mask(struct fsnotify_group *group); -/* drop reference on a group from fsnotify_obtain_group */ -extern void fsnotify_put_group(struct fsnotify_group *group); - -/* take a reference to an event */ -extern void fsnotify_get_event(struct fsnotify_event *event); -extern void fsnotify_put_event(struct fsnotify_event *event); -/* find private data previously attached to an event and unlink it */ -extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, - struct fsnotify_event *event); - -/* attach the event to the group notification queue */ -extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv); -/* true if the group notification queue is empty */ -extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); -/* return, but do not dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group); -/* return AND dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group); - -/* functions used to manipulate the marks attached to inodes */ - -/* run all marks associated with an inode and update inode->i_fsnotify_mask */ -extern void fsnotify_recalc_inode_mask(struct inode *inode); -extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry)); -/* find (and take a reference) to a mark associated with group and inode */ -extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode); -/* attach the mark to both the group and the inode */ -extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode); -/* given a mark, flag it to be freed when all references are dropped */ -extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry); -/* run all the marks in a group, and flag them to be freed */ -extern void 
fsnotify_clear_marks_by_group(struct fsnotify_group *group); -extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry); -extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); -extern void fsnotify_unmount_inodes(struct list_head *list); - -/* put here because inotify does some weird stuff when destroying watches */ -extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is, const char *name, - u32 cookie); - -#else - -static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, - const char *name, u32 cookie) -{} - -static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask) -{} - -static inline void __fsnotify_inode_delete(struct inode *inode) -{} - -static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) -{} - -static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) -{} - -static inline u32 fsnotify_get_cookie(void) -{ - return 0; -} - -static inline void fsnotify_unmount_inodes(struct list_head *list) -{} - -#endif /* CONFIG_FSNOTIFY */ - -#endif /* __KERNEL __ */ - -#endif /* __LINUX_FSNOTIFY_BACKEND_H */ diff --git a/trunk/include/linux/magic.h b/trunk/include/linux/magic.h index 1923327b9869..927138cf3050 100644 --- a/trunk/include/linux/magic.h +++ b/trunk/include/linux/magic.h @@ -6,8 +6,6 @@ #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 #define CODA_SUPER_MAGIC 0x73757245 -#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ -#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ #define DEBUGFS_MAGIC 0x64626720 #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 diff --git a/trunk/include/linux/mount.h b/trunk/include/linux/mount.h index 5d5275364867..51f55f903aff 100644 --- a/trunk/include/linux/mount.h +++ b/trunk/include/linux/mount.h @@ -30,7 +30,7 @@ struct mnt_namespace; #define MNT_STRICTATIME 0x80 #define MNT_SHRINKABLE 0x100 -#define MNT_WRITE_HOLD 0x200 +#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ @@ -65,22 +65,13 @@ struct vfsmount { int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; -#ifdef CONFIG_SMP - int *mnt_writers; -#else - int mnt_writers; -#endif + /* + * This value is not stable unless all of the mnt_writers[] spinlocks + * are held, and all mnt_writer[]s on this mount have 0 as their ->count + */ + atomic_t __mnt_writers; }; -static inline int *get_mnt_writers_ptr(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - return mnt->mnt_writers; -#else - return &mnt->mnt_writers; -#endif -} - static inline struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) @@ -88,11 +79,7 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt) return mnt; } -struct file; /* forward dec */ - extern int mnt_want_write(struct vfsmount *mnt); -extern int mnt_want_write_file(struct file *file); -extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput_no_expire(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); diff --git a/trunk/include/linux/namei.h b/trunk/include/linux/namei.h index d870ae2faedc..518098fe63af 100644 --- a/trunk/include/linux/namei.h +++ b/trunk/include/linux/namei.h @@ -18,7 +18,6 @@ enum { MAX_NESTED_LINKS = 8 }; struct nameidata { struct path path; struct qstr 
last; - struct path root; unsigned int flags; int last_type; unsigned depth; @@ -78,8 +77,8 @@ extern void release_open_intent(struct nameidata *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_noperm(const char *, struct dentry *); -extern int follow_down(struct path *); -extern int follow_up(struct path *); +extern int follow_down(struct vfsmount **, struct dentry **); +extern int follow_up(struct vfsmount **, struct dentry **); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); diff --git a/trunk/include/linux/nfsd/export.h b/trunk/include/linux/nfsd/export.h index a6d9ef2bb34a..bcd0201589f8 100644 --- a/trunk/include/linux/nfsd/export.h +++ b/trunk/include/linux/nfsd/export.h @@ -125,9 +125,11 @@ void nfsd_export_flush(void); void exp_readlock(void); void exp_readunlock(void); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); + struct vfsmount *, + struct dentry *); struct svc_export * rqst_exp_parent(struct svc_rqst *, - struct path *); + struct vfsmount *mnt, + struct dentry *dentry); int exp_rootfh(struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index e6e77d31c418..fbfa3d44d33d 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -93,9 +93,20 @@ struct vmcore { #ifdef CONFIG_PROC_FS +extern spinlock_t proc_subdir_lock; + extern void proc_root_init(void); void proc_flush_task(struct task_struct *task); +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +int task_statm(struct mm_struct *, int *, int *, int *, int *); +void task_mem(struct seq_file *, struct mm_struct *); +void clear_refs_smap(struct mm_struct *mm); + +struct proc_dir_entry *de_get(struct proc_dir_entry *de); +void de_put(struct proc_dir_entry *de); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); @@ -105,7 +116,20 @@ struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, void *data); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); +extern struct vfsmount *proc_mnt; struct pid_namespace; +extern int proc_fill_super(struct super_block *); +extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/<pid> subdirectories.
+ */ +extern int proc_readdir(struct file *, void *, filldir_t); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); extern int pid_ns_prepare_proc(struct pid_namespace *ns); extern void pid_ns_release_proc(struct pid_namespace *ns); diff --git a/trunk/include/linux/qnx4_fs.h b/trunk/include/linux/qnx4_fs.h index 8b9aee1a9ce3..787d19ea9f46 100644 --- a/trunk/include/linux/qnx4_fs.h +++ b/trunk/include/linux/qnx4_fs.h @@ -85,4 +85,65 @@ struct qnx4_super_block { struct qnx4_inode_entry AltBoot; }; +#ifdef __KERNEL__ + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx4_super_block *sb; /* our superblock */ + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern unsigned long qnx4_count_free_blocks(struct super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern struct buffer_head *qnx4_bread(struct inode *, int, int); + +extern const struct inode_operations qnx4_file_inode_operations; +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_file_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); +extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); +extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); +extern void qnx4_truncate(struct inode *inode); +extern void qnx4_free_inode(struct inode *inode); +extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); +extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); +extern int qnx4_sync_file(struct file *file, struct dentry *dentry, int); +extern int qnx4_sync_inode(struct inode *inode); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} + +#endif /* __KERNEL__ */ + #endif diff --git a/trunk/include/linux/quotaops.h b/trunk/include/linux/quotaops.h index 7bc457593684..36353d95c8db 100644 --- a/trunk/include/linux/quotaops.h +++ b/trunk/include/linux/quotaops.h @@ -20,12 +20,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) /* * declaration of quota_function calls in kernel. 
*/ -void sync_quota_sb(struct super_block *sb, int type); -static inline void writeout_quota_sb(struct super_block *sb, int type) -{ - if (sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); -} +void sync_dquots(struct super_block *sb, int type); int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); @@ -258,7 +253,12 @@ static inline void vfs_dq_free_inode(struct inode *inode) inode->i_sb->dq_op->free_inode(inode, 1); } -/* Cannot be called inside a transaction */ +/* The following two functions cannot be called inside a transaction */ +static inline void vfs_dq_sync(struct super_block *sb) +{ + sync_dquots(sb, -1); +} + static inline int vfs_dq_off(struct super_block *sb, int remount) { int ret = -ENOSYS; @@ -334,11 +334,7 @@ static inline void vfs_dq_free_inode(struct inode *inode) { } -static inline void sync_quota_sb(struct super_block *sb, int type) -{ -} - -static inline void writeout_quota_sb(struct super_block *sb, int type) +static inline void vfs_dq_sync(struct super_block *sb) { } diff --git a/trunk/include/linux/reiserfs_fs_sb.h b/trunk/include/linux/reiserfs_fs_sb.h index dab68bbed675..6473650c28f1 100644 --- a/trunk/include/linux/reiserfs_fs_sb.h +++ b/trunk/include/linux/reiserfs_fs_sb.h @@ -453,7 +453,6 @@ enum reiserfs_mount_options { REISERFS_ATTRS, REISERFS_XATTRS_USER, REISERFS_POSIXACL, - REISERFS_EXPOSE_PRIVROOT, REISERFS_BARRIER_NONE, REISERFS_BARRIER_FLUSH, @@ -491,7 +490,6 @@ enum reiserfs_mount_options { #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) -#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) #define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) #define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) diff --git a/trunk/include/linux/writeback.h b/trunk/include/linux/writeback.h index 3224820c8514..93445477f86a 100644 --- a/trunk/include/linux/writeback.h +++ b/trunk/include/linux/writeback.h @@ -79,6 +79,7 @@ struct writeback_control { void writeback_inodes(struct writeback_control *wbc); int inode_wait(void *); void sync_inodes_sb(struct super_block *, int wait); +void sync_inodes(int wait); /* writeback.h requires fs.h; it, too, is not included from here. 
*/ static inline void wait_on_inode(struct inode *inode) diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index c649657e2259..9b68fee8d79e 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -302,8 +302,7 @@ config AUDITSYSCALL config AUDIT_TREE def_bool y - depends on AUDITSYSCALL - select INOTIFY + depends on AUDITSYSCALL && INOTIFY menu "RCU Subsystem" diff --git a/trunk/kernel/audit_tree.c b/trunk/kernel/audit_tree.c index 1f6396d76687..6e7351739a82 100644 --- a/trunk/kernel/audit_tree.c +++ b/trunk/kernel/audit_tree.c @@ -568,7 +568,7 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + root_mnt = collect_mounts(path.mnt, path.dentry); path_put(&path); if (!root_mnt) goto skip_it; @@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + mnt = collect_mounts(path.mnt, path.dentry); path_put(&path); if (!mnt) { err = -ENOMEM; @@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new) err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(&path); + tagged = collect_mounts(path.mnt, path.dentry); path_put(&path); if (!tagged) return -ENOMEM; diff --git a/trunk/kernel/cgroup.c b/trunk/kernel/cgroup.c index 3fb789f6df94..a7267bfd3765 100644 --- a/trunk/kernel/cgroup.c +++ b/trunk/kernel/cgroup.c @@ -46,7 +46,6 @@ #include #include #include -#include #include @@ -901,7 +900,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; - lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -929,7 +927,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.release_agent); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - unlock_kernel(); return ret; } diff --git a/trunk/virt/kvm/kvm_main.c b/trunk/virt/kvm/kvm_main.c index 764554350ed8..e21194566b71 100644 --- a/trunk/virt/kvm/kvm_main.c +++ b/trunk/virt/kvm/kvm_main.c @@ -2604,6 +2604,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, r = -ENOMEM; goto out_free_0; } + cpumask_clear(cpus_hardware_enabled); r = kvm_arch_hardware_setup(); if (r < 0)
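The ufs and xfs hunks above move superblock writeback onto the ->write_super / sb->s_dirt contract: code that changes on-disk super data sets s_dirt (as xfs_trans_apply_sb_deltas now does), and the VFS, via sync_supers(), later calls ->write_super on each dirty superblock. A minimal sketch of that contract for a hypothetical foofs; foofs_commit_super() is an assumed helper, not a real kernel symbol.

#include <linux/fs.h>
#include <linux/smp_lock.h>	/* lock_kernel(); these paths still ran under the BKL */

static void foofs_commit_super(struct super_block *sb);	/* assumed helper */

static void foofs_write_super(struct super_block *sb)
{
	lock_kernel();
	if (!(sb->s_flags & MS_RDONLY))
		foofs_commit_super(sb);	/* write the superblock buffer back */
	sb->s_dirt = 0;			/* superblock is clean again */
	unlock_kernel();
}

/* Metadata writers request a flush simply by marking the sb dirty: */
static void foofs_dirty_super(struct super_block *sb)
{
	sb->s_dirt = 1;	/* sync_supers() will call ->write_super before long */
}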
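The fs/xattr.c hunks swap mnt_want_write_file(f) back to mnt_want_write(f->f_path.mnt), but the calling pattern is the same bracket around any modification through an open file. A sketch of that bracket; do_modification() is a placeholder, not a real kernel function.

#include <linux/fs.h>
#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */

static int do_modification(struct dentry *dentry)
{
	return 0;	/* placeholder for the actual change */
}

static int modify_through_file(struct file *f)
{
	int error;

	error = mnt_want_write(f->f_path.mnt);	/* fails with -EROFS on r/o mounts */
	if (error)
		return error;
	error = do_modification(f->f_path.dentry);
	mnt_drop_write(f->f_path.mnt);		/* always balance the write count */
	return error;
}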
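The dnotify.h and fsnotify.h hunks restore the direct dnotify plumbing (inode_dir_notify(), dnotify_parent()). The userspace interface those hooks feed is unchanged and reachable through fcntl(); a small self-contained example, with error handling trimmed for brevity:

#define _GNU_SOURCE		/* for F_NOTIFY, F_SETSIG and the DN_* flags */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t notified;

static void on_dnotify(int sig)
{
	notified = 1;
}

int main(void)
{
	int fd = open(".", O_RDONLY);	/* dnotify watches directory fds */

	signal(SIGRTMIN, on_dnotify);
	fcntl(fd, F_SETSIG, SIGRTMIN);	/* deliver a real-time signal, not SIGIO */
	fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_RENAME | DN_MULTISHOT);

	pause();			/* returns once the directory changes */
	if (notified)
		printf("directory modified\n");
	close(fd);
	return 0;
}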
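Likewise, the fsnotify_* inline helpers now queue events straight to inotify via inotify_inode_queue_event() and inotify_dentry_parent_queue_event(). The userspace end of that queue is the familiar inotify API; a minimal reader (error handling trimmed):

#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	char *p;
	int fd = inotify_init();

	inotify_add_watch(fd, ".", IN_CREATE | IN_DELETE |
				   IN_MOVED_FROM | IN_MOVED_TO);

	len = read(fd, buf, sizeof(buf));	/* blocks until events are queued */
	for (p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("mask=0x%x cookie=%u name=%s\n",
		       ev->mask, ev->cookie, ev->len ? ev->name : "");
		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}

The cookie field is what pairs IN_MOVED_FROM with IN_MOVED_TO; it is the same value the kernel obtains from inotify_get_cookie() in the fsnotify_move() hunk above.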
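For readers tracing the deleted fsnotify_backend.h: a group registers an fsnotify_ops and receives events matching its mask. The sketch below is purely illustrative, built only from the declarations quoted in the removed header (an API this very patch deletes); the group number 42 and all example_* names are arbitrary.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/fsnotify_backend.h>	/* the header removed above */

static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *inode, __u32 mask)
{
	return true;	/* take everything already filtered by group->mask */
}

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_event *event)
{
	pr_info("fsnotify event mask=0x%x\n", event->mask);
	return 0;
}

static const struct fsnotify_ops example_ops = {
	.should_send_event	= example_should_send_event,
	.handle_event		= example_handle_event,
	/* free_group_priv, freeing_mark, free_event_priv left NULL */
};

static struct fsnotify_group *example_group;

static int __init example_init(void)
{
	example_group = fsnotify_obtain_group(42, FS_CREATE | FS_DELETE,
					      &example_ops);
	return IS_ERR(example_group) ? PTR_ERR(example_group) : 0;
}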
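Finally, the shape of the API change running through lookup_mnt(), follow_down()/follow_up(), collect_mounts() and the nfsd export helpers: a single struct path argument reverts to an explicit (vfsmount, dentry) pair. The two conventions carry the same data, as this self-contained illustration with stub types shows; frob_path()/frob_pair() are placeholders.

struct vfsmount;
struct dentry;

struct path {
	struct vfsmount *mnt;
	struct dentry *dentry;
};

static int frob_pair(struct vfsmount *mnt, struct dentry *dentry)
{
	return (mnt && dentry) ? 0 : -1;	/* stand-in body */
}

/* aggregate style this patch moves away from; callers translate
 * mechanically, exactly as the kernel/audit_tree.c hunks do with
 * collect_mounts(path.mnt, path.dentry): */
static int frob_path(struct path *path)
{
	return frob_pair(path->mnt, path->dentry);
}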