From 79453cfb209e43a0fcddf93170d4ad4674647fda Mon Sep 17 00:00:00 2001
From: Ian Munsie
Date: Wed, 2 Feb 2011 17:27:24 +0000
Subject: [PATCH]

--- yaml ---
r: 251819
b: refs/heads/master
c: 02424d8966d803e33cbe51469be56b5d177b4a37
h: refs/heads/master
i:
  251817: e2c89b6385ad0b7dc8ebda193cadeb8c53600d29
  251815: 1b6dcb9e835f09026789de09991bf44dc5b1fc4a
v: v3
---
 [refs] | 2 +-
 .../ABI/{removed => obsolete}/o2cb | 9 +-
 .../ABI/testing/sysfs-kernel-mm-cleancache | 11 -
 .../feature-removal-schedule.txt | 10 +
 trunk/Documentation/filesystems/ext4.txt | 4 +
 trunk/Documentation/filesystems/ocfs2.txt | 8 +-
 trunk/Documentation/filesystems/xfs.txt | 6 -
 trunk/Documentation/vm/cleancache.txt | 278 ----
 trunk/MAINTAINERS | 13 +-
 trunk/arch/powerpc/Kconfig | 1 +
 trunk/arch/powerpc/include/asm/ftrace.h | 14 +
 trunk/arch/powerpc/include/asm/syscall.h | 5 +
 trunk/arch/powerpc/include/asm/thread_info.h | 7 +-
 trunk/arch/powerpc/kernel/Makefile | 1 +
 trunk/arch/powerpc/kernel/ftrace.c | 8 +
 trunk/arch/powerpc/kernel/ptrace.c | 10 +
 trunk/arch/x86/include/asm/xen/hypercall.h | 7 -
 trunk/drivers/video/mb862xx/mb862xx-i2c.c | 1 -
 trunk/drivers/xen/Makefile | 1 -
 trunk/drivers/xen/tmem.c | 264 ---
 trunk/fs/9p/vfs_inode.c | 4 -
 trunk/fs/Kconfig | 31 +-
 trunk/fs/affs/namei.c | 5 -
 trunk/fs/afs/dir.c | 5 -
 trunk/fs/autofs4/root.c | 2 -
 trunk/fs/bfs/dir.c | 3 -
 trunk/fs/btrfs/extent_io.c | 9 -
 trunk/fs/btrfs/super.c | 2 -
 trunk/fs/buffer.c | 64 +-
 trunk/fs/coda/dir.c | 5 -
 trunk/fs/configfs/dir.c | 2 -
 trunk/fs/ecryptfs/inode.c | 5 -
 trunk/fs/ext3/super.c | 2 -
 trunk/fs/ext4/Makefile | 3 +-
 trunk/fs/ext4/balloc.c | 146 +-
 trunk/fs/ext4/ext4.h | 127 +-
 trunk/fs/ext4/ext4_jbd2.c | 14 +
 trunk/fs/ext4/ext4_jbd2.h | 5 +
 trunk/fs/ext4/extents.c | 1416 +++++++----------
 trunk/fs/ext4/file.c | 1 +
 trunk/fs/ext4/fsync.c | 25 +-
 trunk/fs/ext4/inode.c | 114 +-
 trunk/fs/ext4/mballoc.c | 459 +++---
 trunk/fs/ext4/mballoc.h | 6 +
 trunk/fs/ext4/migrate.c | 2 +-
 trunk/fs/ext4/mmp.c | 351 ----
 trunk/fs/ext4/move_extent.c | 3 +-
 trunk/fs/ext4/namei.c | 82 +-
 trunk/fs/ext4/page-io.c | 39 +-
 trunk/fs/ext4/super.c | 206 +--
 trunk/fs/ext4/xattr.c | 4 +-
 trunk/fs/fat/namei_msdos.c | 5 -
 trunk/fs/fat/namei_vfat.c | 5 -
 trunk/fs/fuse/dir.c | 6 -
 trunk/fs/hfs/dir.c | 6 -
 trunk/fs/hfsplus/dir.c | 8 +-
 trunk/fs/hostfs/hostfs_kern.c | 5 -
 trunk/fs/hpfs/namei.c | 9 +-
 trunk/fs/hugetlbfs/inode.c | 3 +-
 trunk/fs/jbd2/commit.c | 22 +-
 trunk/fs/jbd2/journal.c | 58 +-
 trunk/fs/jbd2/transaction.c | 22 +-
 trunk/fs/jffs2/dir.c | 5 -
 trunk/fs/jfs/namei.c | 5 -
 trunk/fs/logfs/dir.c | 5 -
 trunk/fs/minix/namei.c | 5 -
 trunk/fs/mpage.c | 7 -
 trunk/fs/namei.c | 380 +++--
 trunk/fs/namespace.c | 2 +-
 trunk/fs/ncpfs/dir.c | 5 -
 trunk/fs/nilfs2/namei.c | 5 -
 trunk/fs/ocfs2/Makefile | 1 -
 trunk/fs/ocfs2/alloc.c | 166 --
 trunk/fs/ocfs2/alloc.h | 1 -
 trunk/fs/ocfs2/cluster/sys.c | 9 +
 trunk/fs/ocfs2/dlm/dlmcommon.h | 14 -
 trunk/fs/ocfs2/dlm/dlmdebug.c | 6 -
 trunk/fs/ocfs2/dlm/dlmdomain.c | 94 +-
 trunk/fs/ocfs2/dlm/dlmmaster.c | 255 +--
 trunk/fs/ocfs2/dlm/dlmrecovery.c | 1 -
 trunk/fs/ocfs2/dlmfs/dlmfs.c | 2 +-
 trunk/fs/ocfs2/file.c | 1 -
 trunk/fs/ocfs2/ioctl.c | 492 +-----
 trunk/fs/ocfs2/move_extents.c | 1153 --------------
 trunk/fs/ocfs2/move_extents.h | 22 -
 trunk/fs/ocfs2/ocfs2_ioctl.h | 68 -
 trunk/fs/ocfs2/ocfs2_trace.h | 25 -
 trunk/fs/ocfs2/refcounttree.c | 58 +-
 trunk/fs/ocfs2/refcounttree.h | 11 -
 trunk/fs/ocfs2/super.c | 4 +-
 trunk/fs/omfs/dir.c | 11 +-
 trunk/fs/proc/Makefile | 1 -
 trunk/fs/proc/base.c | 20 +-
 trunk/fs/proc/inode.c | 7 -
 trunk/fs/proc/internal.h | 18 -
 trunk/fs/proc/namespaces.c | 198 ---
 trunk/fs/proc/task_mmu.c | 2 +-
 trunk/fs/reiserfs/namei.c | 5 -
 trunk/fs/reiserfs/xattr.c | 1 +
 trunk/fs/super.c | 3 -
 trunk/fs/sysv/namei.c | 5 -
 trunk/fs/ubifs/dir.c | 5 -
 trunk/fs/udf/namei.c | 5 -
 trunk/fs/ufs/namei.c | 5 -
 trunk/fs/xfs/linux-2.6/xfs_discard.c | 29 -
 trunk/fs/xfs/linux-2.6/xfs_discard.h | 2 -
 trunk/fs/xfs/linux-2.6/xfs_super.c | 18 +-
 trunk/fs/xfs/xfs_ag.h | 3 -
 trunk/fs/xfs/xfs_alloc.c | 35 +-
 trunk/fs/xfs/xfs_alloc.h | 5 +-
 trunk/fs/xfs/xfs_alloc_btree.c | 3 +-
 trunk/fs/xfs/xfs_bmap.c | 549 ++++---
 trunk/fs/xfs/xfs_bmap.h | 2 +
 trunk/fs/xfs/xfs_inode.c | 15 +-
 trunk/fs/xfs/xfs_inode.h | 1 +
 trunk/fs/xfs/xfs_log_cil.c | 13 +-
 trunk/fs/xfs/xfs_mount.h | 1 -
 trunk/fs/xfs/xfs_trans.c | 2 +-
 trunk/include/linux/buffer_head.h | 16 -
 trunk/include/linux/cleancache.h | 122 --
 trunk/include/linux/fs.h | 5 -
 trunk/include/linux/hugetlb.h | 7 +-
 trunk/include/linux/hugetlb_inline.h | 2 +-
 trunk/include/linux/if_link.h | 1 -
 trunk/include/linux/jbd2.h | 8 +-
 trunk/include/linux/mm.h | 6 +-
 trunk/include/linux/mm_types.h | 4 +-
 trunk/include/linux/proc_fs.h | 21 -
 trunk/include/linux/syscalls.h | 1 -
 trunk/include/net/net_namespace.h | 1 -
 trunk/include/xen/interface/xen.h | 22 -
 trunk/ipc/namespace.c | 37 -
 trunk/ipc/shm.c | 2 +-
 trunk/kernel/nsproxy.c | 42 -
 trunk/kernel/utsname.c | 39 -
 trunk/mm/Kconfig | 23 -
 trunk/mm/Makefile | 1 -
 trunk/mm/cleancache.c | 244 ---
 trunk/mm/filemap.c | 11 -
 trunk/mm/fremap.c | 2 +-
 trunk/mm/hugetlb.c | 4 +-
 trunk/mm/memory.c | 2 +-
 trunk/mm/mlock.c | 8 +-
 trunk/mm/mmap.c | 8 +-
 trunk/mm/slub.c | 1 +
 trunk/mm/truncate.c | 6 -
 trunk/net/core/net_namespace.c | 65 -
 trunk/net/core/rtnetlink.c | 5 +-
 148 files changed, 2050 insertions(+), 6383 deletions(-)
 rename trunk/Documentation/ABI/{removed => obsolete}/o2cb (65%)
 delete mode 100644 trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
 delete mode 100644 trunk/Documentation/vm/cleancache.txt
 delete mode 100644 trunk/drivers/xen/tmem.c
 delete mode 100644 trunk/fs/ext4/mmp.c
 delete mode 100644 trunk/fs/ocfs2/move_extents.c
 delete mode 100644 trunk/fs/ocfs2/move_extents.h
 delete mode 100644 trunk/fs/proc/namespaces.c
 delete mode 100644 trunk/include/linux/cleancache.h
 delete mode 100644 trunk/mm/cleancache.c

diff --git a/[refs] b/[refs]
index e5a7f504fe7e..26644b9c747b 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: be93d8cfbae1996052e91b2883d306a5d9d0fe18
+refs/heads/master: 02424d8966d803e33cbe51469be56b5d177b4a37
diff --git a/trunk/Documentation/ABI/removed/o2cb b/trunk/Documentation/ABI/obsolete/o2cb
similarity index 65%
rename from trunk/Documentation/ABI/removed/o2cb
rename to trunk/Documentation/ABI/obsolete/o2cb
index 7f5daa465093..9c49d8e6c0cc 100644
--- a/trunk/Documentation/ABI/removed/o2cb
+++ b/trunk/Documentation/ABI/obsolete/o2cb
@@ -1,10 +1,11 @@
 What:		/sys/o2cb symlink
-Date:		May 2011
-KernelVersion:	2.6.40
+Date:		Dec 2005
+KernelVersion:	2.6.16
 Contact:	ocfs2-devel@oss.oracle.com
-Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
-		removed when new versions of ocfs2-tools which know to look
+Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
+		be removed when new versions of ocfs2-tools which know to look
 		in /sys/fs/o2cb are sufficiently prevalent. Don't code new
 		software to look here, it should try /sys/fs/o2cb instead.
+		See Documentation/ABI/stable/o2cb for more information on usage.
 Users:		ocfs2-tools.
		It's sufficient to mail proposed changes to ocfs2-devel@oss.oracle.com.
diff --git a/trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
deleted file mode 100644
index 662ae646ea12..000000000000
--- a/trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
+++ /dev/null
@@ -1,11 +0,0 @@
-What:		/sys/kernel/mm/cleancache/
-Date:		April 2011
-Contact:	Dan Magenheimer
-Description:
-		/sys/kernel/mm/cleancache/ contains a number of files which
-		record a count of various cleancache operations
-		(sum across all filesystems):
-		succ_gets
-		failed_gets
-		puts
-		flushes
diff --git a/trunk/Documentation/feature-removal-schedule.txt b/trunk/Documentation/feature-removal-schedule.txt
index ff31b1cc50aa..95788ad2506c 100644
--- a/trunk/Documentation/feature-removal-schedule.txt
+++ b/trunk/Documentation/feature-removal-schedule.txt
@@ -262,6 +262,16 @@ Who:	Michael Buesch
 
 ---------------------------
 
+What:	/sys/o2cb symlink
+When:	January 2010
+Why:	/sys/fs/o2cb is the proper location for this information - /sys/o2cb
+	exists as a symlink for backwards compatibility for old versions of
+	ocfs2-tools. 2 years should be sufficient time to phase in new versions
+	which know to look in /sys/fs/o2cb.
+Who:	ocfs2-devel@oss.oracle.com
+
+---------------------------
+
 What:	Ability for non root users to shm_get hugetlb pages based on
 	mlock resource limits
 When:	2.6.31
diff --git a/trunk/Documentation/filesystems/ext4.txt b/trunk/Documentation/filesystems/ext4.txt
index 3ae9bc94352a..c79ec58fd7f6 100644
--- a/trunk/Documentation/filesystems/ext4.txt
+++ b/trunk/Documentation/filesystems/ext4.txt
@@ -226,6 +226,10 @@ acl		Enables POSIX Access Control Lists support.
 noacl		This option disables POSIX Access Control List
 		support.
 
+reservation
+
+noreservation
+
 bsddf	(*)	Make 'df' act like BSD.
 minixdf		Make 'df' act like Minix.
diff --git a/trunk/Documentation/filesystems/ocfs2.txt b/trunk/Documentation/filesystems/ocfs2.txt
index 7618a287aa41..9ed920a8cd79 100644
--- a/trunk/Documentation/filesystems/ocfs2.txt
+++ b/trunk/Documentation/filesystems/ocfs2.txt
@@ -46,15 +46,9 @@ errors=panic		Panic and halt the machine if an error occurs.
 intr		(*)	Allow signals to interrupt cluster operations.
 nointr			Do not allow signals to interrupt cluster
 			operations.
-noatime			Do not update access time.
-relatime(*)		Update atime if the previous atime is older than
-			mtime or ctime
-strictatime		Always update atime, but the minimum update interval
-			is specified by atime_quantum.
 atime_quantum=60(*)	OCFS2 will not update atime unless this number
 			of seconds has passed since the last update.
-			Set to zero to always update atime. This option need
-			work with strictatime.
+			Set to zero to always update atime.
 data=ordered	(*)	All data are forced directly out to the main
 			file system prior to its metadata being committed
 			to the journal.
diff --git a/trunk/Documentation/filesystems/xfs.txt b/trunk/Documentation/filesystems/xfs.txt
index 3fc0c31a6f5d..7bff3e4f35df 100644
--- a/trunk/Documentation/filesystems/xfs.txt
+++ b/trunk/Documentation/filesystems/xfs.txt
@@ -39,12 +39,6 @@ When mounting an XFS filesystem, the following options are accepted.
 	drive level write caching to be enabled, for devices that
 	support write barriers.
 
-  discard
-	Issue command to let the block device reclaim space freed by the
-	filesystem. This is useful for SSD devices, thinly provisioned
-	LUNs and virtual machine images, but may have a performance
-	impact. This option is incompatible with the nodelaylog option.
-
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
 	Use with the "mtpt" option.
diff --git a/trunk/Documentation/vm/cleancache.txt b/trunk/Documentation/vm/cleancache.txt
deleted file mode 100644
index 36c367c73084..000000000000
--- a/trunk/Documentation/vm/cleancache.txt
+++ /dev/null
@@ -1,278 +0,0 @@
-MOTIVATION
-
-Cleancache is a new optional feature provided by the VFS layer that
-potentially dramatically increases page cache effectiveness for
-many workloads in many environments at a negligible cost.
-
-Cleancache can be thought of as a page-granularity victim cache for clean
-pages that the kernel's pageframe replacement algorithm (PFRA) would like
-to keep around, but can't since there isn't enough memory. So when the
-PFRA "evicts" a page, it first attempts to use cleancache code to
-put the data contained in that page into "transcendent memory", memory
-that is not directly accessible or addressable by the kernel and is
-of unknown and possibly time-varying size.
-
-Later, when a cleancache-enabled filesystem wishes to access a page
-in a file on disk, it first checks cleancache to see if it already
-contains it; if it does, the page of data is copied into the kernel
-and a disk access is avoided.
-
-Transcendent memory "drivers" for cleancache are currently implemented
-in Xen (using hypervisor memory) and zcache (using in-kernel compressed
-memory) and other implementations are in development.
-
-FAQs are included below.
-
-IMPLEMENTATION OVERVIEW
-
-A cleancache "backend" that provides transcendent memory registers itself
-to the kernel's cleancache "frontend" by calling cleancache_register_ops,
-passing a pointer to a cleancache_ops structure with funcs set appropriately.
-Note that cleancache_register_ops returns the previous settings so that
-chaining can be performed if desired. The functions provided must conform to
-certain semantics as follows:
-
-Most important, cleancache is "ephemeral". Pages which are copied into
-cleancache have an indefinite lifetime which is completely unknowable
-by the kernel and so may or may not still be in cleancache at any later time.
-Thus, as its name implies, cleancache is not suitable for dirty pages.
-Cleancache has complete discretion over what pages to preserve and what
-pages to discard and when.
-
-Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a
-pool id which, if positive, must be saved in the filesystem's superblock;
-a negative return value indicates failure. A "put_page" will copy a
-(presumably about-to-be-evicted) page into cleancache and associate it with
-the pool id, a file key, and a page index into the file. (The combination
-of a pool id, a file key, and an index is sometimes called a "handle".)
-A "get_page" will copy the page, if found, from cleancache into kernel memory.
-A "flush_page" will ensure the page no longer is present in cleancache;
-a "flush_inode" will flush all pages associated with the specified file;
-and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
-all files specified by the given pool id and also surrender the pool id.
-
-An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
-to treat the pool as shared using a 128-bit UUID as a key. On systems
-that may run multiple kernels (such as hard partitioned or virtualized
-systems) that may share a clustered filesystem, and where cleancache
-may be shared among those kernels, calls to init_shared_fs that specify the
-same UUID will receive the same pool id, thus allowing the pages to
-be shared. Note that any security requirements must be imposed outside
-of the kernel (e.g. by "tools" that control cleancache). Or a
-cleancache implementation can simply disable shared_init by always
-returning a negative value.
-
-If a get_page is successful on a non-shared pool, the page is flushed (thus
-making cleancache an "exclusive" cache). On a shared pool, the page
-is NOT flushed on a successful get_page so that it remains accessible to
-other sharers. The kernel is responsible for ensuring coherency between
-cleancache (shared or not), the page cache, and the filesystem, using
-cleancache flush operations as required.
-
-Note that cleancache must enforce put-put-get coherency and get-get
-coherency. For the former, if two puts are made to the same handle but
-with different data, say AAA by the first put and BBB by the second, a
-subsequent get can never return the stale data (AAA). For get-get coherency,
-if a get for a given handle fails, subsequent gets for that handle will
-never succeed unless preceded by a successful put with that handle.
-
-Last, cleancache provides no SMP serialization guarantees; if two
-different Linux threads are simultaneously putting and flushing a page
-with the same handle, the results are indeterminate. Callers must
-lock the page to ensure serial behavior.
-
-CLEANCACHE PERFORMANCE METRICS
-
-Cleancache monitoring is done by sysfs files in the
-/sys/kernel/mm/cleancache directory. The effectiveness of cleancache
-can be measured (across all filesystems) with:
-
-succ_gets - number of gets that were successful
-failed_gets - number of gets that failed
-puts - number of puts attempted (all "succeed")
-flushes - number of flushes attempted
-
-A backend implementation may provide additional metrics.
-
-FAQ
-
-1) Where's the value? (Andrew Morton)
-
-Cleancache provides a significant performance benefit to many workloads
-in many environments with negligible overhead by improving the
-effectiveness of the pagecache. Clean pagecache pages are
-saved in transcendent memory (RAM that is otherwise not directly
-addressable to the kernel); fetching those pages later avoids "refaults"
-and thus disk reads.
-
-Cleancache (and its sister code "frontswap") provide interfaces for
-this transcendent memory (aka "tmem"), which conceptually lies between
-fast kernel-directly-addressable RAM and slower DMA/asynchronous devices.
-Disallowing direct kernel or userland reads/writes to tmem
-is ideal when data is transformed to a different form and size (such
-as with compression) or secretly moved (as might be useful for write-
-balancing for some RAM-like devices). Evicted page-cache pages (and
-swap pages) are a great use for this kind of slower-than-RAM-but-much-
-faster-than-disk transcendent memory, and the cleancache (and frontswap)
-"page-object-oriented" specification provides a nice way to read and
-write -- and indirectly "name" -- the pages.
-
-In the virtual case, the whole point of virtualization is to statistically
-multiplex physical resources across the varying demands of multiple
-virtual machines. This is really hard to do with RAM and efforts to
-do it well with no kernel change have essentially failed (except in some
-well-publicized special-case workloads). Cleancache -- and frontswap --
-with a fairly small impact on the kernel, provide a huge amount
-of flexibility for more dynamic, flexible RAM multiplexing.
-Specifically, the Xen Transcendent Memory backend allows otherwise
-"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
-virtual machines, but the pages can be compressed and deduplicated to
-optimize RAM utilization. And when guest OS's are induced to surrender
-underutilized RAM (e.g. with "self-ballooning"), page cache pages
-are the first to go, and cleancache allows those pages to be
-saved and reclaimed if overall host system memory conditions allow.
-
-And the identical interface used for cleancache can be used in
-physical systems as well. The zcache driver acts as a memory-hungry
-device that stores pages of data in a compressed state. And
-the proposed "RAMster" driver shares RAM across multiple physical
-systems.
-
-2) Why does cleancache have its sticky fingers so deep inside the
-   filesystems and VFS? (Andrew Morton and Christoph Hellwig)
-
-The core hooks for cleancache in VFS are in most cases a single line
-and the minimum set are placed precisely where needed to maintain
-coherency (via cleancache_flush operations) between cleancache,
-the page cache, and disk. All hooks compile into nothingness if
-cleancache is config'ed off and turn into a function-pointer-
-compare-to-NULL if config'ed on but no backend claims the ops
-functions, or to a compare-struct-element-to-negative if a
-backend claims the ops functions but a filesystem doesn't enable
-cleancache.
-
-Some filesystems are built entirely on top of VFS and the hooks
-in VFS are sufficient, so don't require an "init_fs" hook; the
-initial implementation of cleancache didn't provide this hook.
-But for some filesystems (such as btrfs), the VFS hooks are
-incomplete and one or more hooks in fs-specific code are required.
-And for some other filesystems, such as tmpfs, cleancache may
-be counterproductive. So it seemed prudent to require a filesystem
-to "opt in" to use cleancache, which requires adding a hook in
-each filesystem. Not all filesystems are supported by cleancache
-only because they haven't been tested. The existing set should
-be sufficient to validate the concept, the opt-in approach means
-that untested filesystems are not affected, and the hooks in the
-existing filesystems should make it very easy to add more
-filesystems in the future.
-
-The total impact of the hooks to existing fs and mm files is only
-about 40 lines added (not counting comments and blank lines).
-
-3) Why not make cleancache asynchronous and batched so it can
-   more easily interface with real devices with DMA instead
-   of copying each individual page? (Minchan Kim)
-
-The one-page-at-a-time copy semantics simplifies the implementation
-on both the frontend and backend and also allows the backend to
-do fancy things on-the-fly like page compression and
-page deduplication. And since the data is "gone" (copied into/out
-of the pageframe) before the cleancache get/put call returns,
-a great deal of race conditions and potential coherency issues
-are avoided. While the interface seems odd for a "real device"
-or for real kernel-addressable RAM, it makes perfect sense for
-transcendent memory.
-
-4) Why is non-shared cleancache "exclusive"? And where is the
-   page "flushed" after a "get"? (Minchan Kim)
-
-The main reason is to free up space in transcendent memory and
-to avoid unnecessary cleancache_flush calls. If you want inclusive,
-the page can be "put" immediately following the "get". If
-put-after-get for inclusive becomes common, the interface could
-be easily extended to add a "get_no_flush" call.
-
-The flush is done by the cleancache backend implementation.
-
-5) What's the performance impact?
-
-Performance analysis has been presented at OLS'09 and LCA'10.
-Briefly, performance gains can be significant on most workloads,
-especially when memory pressure is high (e.g. when RAM is
-overcommitted in a virtual workload); and because the hooks are
-invoked primarily in place of or in addition to a disk read/write,
-overhead is negligible even in worst case workloads. Basically
-cleancache replaces I/O with memory-copy-CPU-overhead; on older
-single-core systems with slow memory-copy speeds, cleancache
-has little value, but in newer multicore machines, especially
-consolidated/virtualized machines, it has great value.
-
-6) How do I add cleancache support for filesystem X? (Boaz Harrash)
-
-Filesystems that are well-behaved and conform to certain
-restrictions can utilize cleancache simply by making a call to
-cleancache_init_fs at mount time. Unusual, misbehaving, or
-poorly layered filesystems must either add additional hooks
-and/or undergo extensive additional testing... or should just
-not enable the optional cleancache.
-
-Some points for a filesystem to consider:
-
-- The FS should be block-device-based (e.g. a ram-based FS such
-  as tmpfs should not enable cleancache)
-- To ensure coherency/correctness, the FS must ensure that all
-  file removal or truncation operations either go through VFS or
-  add hooks to do the equivalent cleancache "flush" operations
-- To ensure coherency/correctness, either inode numbers must
-  be unique across the lifetime of the on-disk file OR the
-  FS must provide an "encode_fh" function.
-- The FS must call the VFS superblock alloc and deactivate routines
-  or add hooks to do the equivalent cleancache calls done there.
-- To maximize performance, all pages fetched from the FS should
-  go through the do_mpage_readpage routine or the FS should add
-  hooks to do the equivalent (cf. btrfs)
-- Currently, the FS blocksize must be the same as PAGESIZE. This
-  is not an architectural restriction, but no backends currently
-  support anything different.
-- A clustered FS should invoke the "shared_init_fs" cleancache
-  hook to get best performance for some backends.
-
-7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
-
-If cleancache would use the inode virtual address instead of
-inode/filehandle, the pool id could be eliminated. But, this
-won't work because cleancache retains pagecache data pages
-persistently even when the inode has been pruned from the
-inode unused list, and only flushes the data page if the file
-gets removed/truncated. So if cleancache used the inode kva,
-there would be potential coherency issues if/when the inode
-kva is reused for a different file. Alternately, if cleancache
-flushed the pages when the inode kva was freed, much of the value
-of cleancache would be lost because the cache of pages in cleancache
-is potentially much larger than the kernel pagecache and is most
-useful if the pages survive inode cache removal.
-
-8) Why is a global variable required?
-
-The cleancache_enabled flag is checked in all of the frequently-used
-cleancache hooks. The alternative is a function call to check a static
-variable. Since cleancache is enabled dynamically at runtime, systems
-that don't enable cleancache would suffer thousands (possibly
-tens-of-thousands) of unnecessary function calls per second. So the
-global variable allows cleancache to be enabled by default at compile
-time, but have insignificant performance impact when cleancache remains
-disabled at runtime.
-
-9) Does cleancache work with KVM?
-
-The memory model of KVM is sufficiently different that a cleancache
-backend may have less value for KVM. This remains to be tested,
-especially in an overcommitted system.
-
-10) Does cleancache work in userspace? It sounds useful for
-   memory hungry caches like web browsers. (Jamie Lokier)
-
-No plans yet, though we agree it sounds useful, at least for
-apps that bypass the page cache (e.g. O_DIRECT).
-
-Last updated: Dan Magenheimer, April 13 2011
diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS
index d54d551004f7..1ab17de642e5 100644
--- a/trunk/MAINTAINERS
+++ b/trunk/MAINTAINERS
@@ -3572,16 +3572,9 @@ M:	Andrew Morton
 M:	Jan Kara
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd/
-F:	include/linux/ext3_jbd.h
-F:	include/linux/jbd.h
-
-JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
-M:	"Theodore Ts'o"
-L:	linux-ext4@vger.kernel.org
-S:	Maintained
-F:	fs/jbd2/
-F:	include/linux/jbd2.h
+F:	fs/jbd*/
+F:	include/linux/ext*jbd*.h
+F:	include/linux/jbd*.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao
diff --git a/trunk/arch/powerpc/Kconfig b/trunk/arch/powerpc/Kconfig
index 423145a6f7ba..2f6a22e8e935 100644
--- a/trunk/arch/powerpc/Kconfig
+++ b/trunk/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
 	select HAVE_RCU_TABLE_FREE if SMP
+	select HAVE_SYSCALL_TRACEPOINTS
 
 config EARLY_PRINTK
 	bool
diff --git a/trunk/arch/powerpc/include/asm/ftrace.h b/trunk/arch/powerpc/include/asm/ftrace.h
index dde1296b8b41..169d039ed402 100644
--- a/trunk/arch/powerpc/include/asm/ftrace.h
+++ b/trunk/arch/powerpc/include/asm/ftrace.h
@@ -60,4 +60,18 @@ struct dyn_arch_ftrace {
 
 #endif
 
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) && !defined(__ASSEMBLY__)
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+	/*
+	 * Compare the symbol name with the system call name. Skip the .sys or .SyS
+	 * prefix from the symbol name and the sys prefix from the system call name and
+	 * just match the rest. This is only needed on ppc64 since symbol names on
+	 * 32bit do not start with a period so the generic function will work.
+ */ + return !strcmp(sym + 4, name + 3); +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 && !__ASSEMBLY__ */ + #endif /* _ASM_POWERPC_FTRACE */ diff --git a/trunk/arch/powerpc/include/asm/syscall.h b/trunk/arch/powerpc/include/asm/syscall.h index 23913e902fc3..b54b2add07be 100644 --- a/trunk/arch/powerpc/include/asm/syscall.h +++ b/trunk/arch/powerpc/include/asm/syscall.h @@ -15,6 +15,11 @@ #include +/* ftrace syscalls requires exporting the sys_call_table */ +#ifdef CONFIG_FTRACE_SYSCALLS +extern const unsigned long *sys_call_table; +#endif /* CONFIG_FTRACE_SYSCALLS */ + static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/trunk/arch/powerpc/include/asm/thread_info.h b/trunk/arch/powerpc/include/asm/thread_info.h index 37c353e8af7c..836f231ec1f0 100644 --- a/trunk/arch/powerpc/include/asm/thread_info.h +++ b/trunk/arch/powerpc/include/asm/thread_info.h @@ -110,7 +110,8 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NOERROR 12 /* Force successful syscall return */ #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_FREEZE 14 /* Freezing for suspend */ -#define TIF_RUNLATCH 15 /* Is the runlatch enabled? */ +#define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ +#define TIF_RUNLATCH 16 /* Is the runlatch enabled? */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE @@ -600,3 +601,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) +unsigned long __init arch_syscall_addr(int nr) +{ + return sys_call_table[nr*2]; +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ diff --git a/trunk/arch/powerpc/kernel/ptrace.c b/trunk/arch/powerpc/kernel/ptrace.c index a6ae1cfad86c..cb22024f2b42 100644 --- a/trunk/arch/powerpc/kernel/ptrace.c +++ b/trunk/arch/powerpc/kernel/ptrace.c @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_PPC32 #include #endif @@ -40,6 +41,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * The parameter save area on the stack is used to store arguments being passed * to callee function and is located at fixed offset from stack pointer. 
@@ -1710,6 +1714,9 @@ long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->gpr[0]); + if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 if (!is_32bit_task()) @@ -1738,6 +1745,9 @@ void do_syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, regs->result); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->result); + step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); diff --git a/trunk/arch/x86/include/asm/xen/hypercall.h b/trunk/arch/x86/include/asm/xen/hypercall.h index d240ea950519..8508bfe52296 100644 --- a/trunk/arch/x86/include/asm/xen/hypercall.h +++ b/trunk/arch/x86/include/asm/xen/hypercall.h @@ -447,13 +447,6 @@ HYPERVISOR_hvm_op(int op, void *arg) return _hypercall2(unsigned long, hvm_op, op, arg); } -static inline int -HYPERVISOR_tmem_op( - struct tmem_op *op) -{ - return _hypercall1(int, tmem_op, op); -} - static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/trunk/drivers/video/mb862xx/mb862xx-i2c.c b/trunk/drivers/video/mb862xx/mb862xx-i2c.c index b953099edd8e..cb77d3b4657d 100644 --- a/trunk/drivers/video/mb862xx/mb862xx-i2c.c +++ b/trunk/drivers/video/mb862xx/mb862xx-i2c.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "mb862xxfb.h" #include "mb862xx_reg.h" diff --git a/trunk/drivers/xen/Makefile b/trunk/drivers/xen/Makefile index bbc18258ecc5..4781f806701d 100644 --- a/trunk/drivers/xen/Makefile +++ b/trunk/drivers/xen/Makefile @@ -1,6 +1,5 @@ obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ -obj-y += tmem.o nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) diff --git a/trunk/drivers/xen/tmem.c b/trunk/drivers/xen/tmem.c deleted file mode 100644 index 816a44959ef0..000000000000 --- a/trunk/drivers/xen/tmem.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Xen implementation for transcendent memory (tmem) - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. 
- * Author: Dan Magenheimer - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define TMEM_CONTROL 0 -#define TMEM_NEW_POOL 1 -#define TMEM_DESTROY_POOL 2 -#define TMEM_NEW_PAGE 3 -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_READ 8 -#define TMEM_WRITE 9 -#define TMEM_XCHG 10 - -/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_VERSION_SHIFT 24 - - -struct tmem_pool_uuid { - u64 uuid_lo; - u64 uuid_hi; -}; - -struct tmem_oid { - u64 oid[3]; -}; - -#define TMEM_POOL_PRIVATE_UUID { 0, 0 } - -/* flags for tmem_ops.new_pool */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 - -/* xen tmem foundation ops/hypercalls */ - -static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, - u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) -{ - struct tmem_op op; - int rc = 0; - - op.cmd = tmem_cmd; - op.pool_id = tmem_pool; - op.u.gen.oid[0] = oid.oid[0]; - op.u.gen.oid[1] = oid.oid[1]; - op.u.gen.oid[2] = oid.oid[2]; - op.u.gen.index = index; - op.u.gen.tmem_offset = tmem_offset; - op.u.gen.pfn_offset = pfn_offset; - op.u.gen.len = len; - set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, - u32 flags, unsigned long pagesize) -{ - struct tmem_op op; - int rc = 0, pageshift; - - for (pageshift = 0; pagesize != 1; pageshift++) - pagesize >>= 1; - flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; - flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; - op.cmd = TMEM_NEW_POOL; - op.u.new.uuid[0] = uuid.uuid_lo; - op.u.new.uuid[1] = uuid.uuid_hi; - op.u.new.flags = flags; - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -/* xen generic tmem ops */ - -static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, - u32 index, unsigned long pfn) -{ - unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; - - return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, - gmfn, 0, 0, 0); -} - -static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, - u32 index, unsigned long pfn) -{ - unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; - - return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, - gmfn, 0, 0, 0); -} - -static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) -{ - return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, - 0, 0, 0, 0); -} - -static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) -{ - return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); -} - -static int xen_tmem_destroy_pool(u32 pool_id) -{ - struct tmem_oid oid = { { 0 } }; - - return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); -} - -int tmem_enabled; - -static int __init enable_tmem(char *s) -{ - tmem_enabled = 1; - return 1; -} - -__setup("tmem", enable_tmem); - -/* cleancache ops */ - -static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - unsigned long pfn = page_to_pfn(page); - - if (pool < 0) - return; - if (ind != index) - return; - mb(); /* ensure page is quiescent; tmem may address it with an alias */ - (void)xen_tmem_put_page((u32)pool, oid, ind, pfn); -} - -static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - unsigned long pfn = page_to_pfn(page); - int ret; - - /* translate return values to linux semantics */ - if (pool < 0) - return -1; - if (ind != index) - return -1; - ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); - if (ret == 1) - return 0; - else - return -1; -} - -static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - if (ind != index) - return; - (void)xen_tmem_flush_page((u32)pool, oid, ind); -} - -static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - (void)xen_tmem_flush_object((u32)pool, oid); -} - -static void tmem_cleancache_flush_fs(int pool) -{ - if (pool < 0) - return; - (void)xen_tmem_destroy_pool((u32)pool); -} - -static int tmem_cleancache_init_fs(size_t pagesize) -{ - struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; - - return xen_tmem_new_pool(uuid_private, 0, pagesize); -} - -static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - struct tmem_pool_uuid shared_uuid; - - shared_uuid.uuid_lo = *(u64 *)uuid; - shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); - return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); -} - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - -static struct cleancache_ops tmem_cleancache_ops = { - .put_page = tmem_cleancache_put_page, - .get_page = tmem_cleancache_get_page, - .flush_page = tmem_cleancache_flush_page, - .flush_inode = tmem_cleancache_flush_inode, - .flush_fs = tmem_cleancache_flush_fs, - .init_shared_fs = tmem_cleancache_init_shared_fs, - .init_fs = tmem_cleancache_init_fs -}; - -static int __init xen_tmem_init(void) -{ - struct cleancache_ops old_ops; - - if (!xen_domain()) - return 0; -#ifdef CONFIG_CLEANCACHE - BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { - char *s = ""; - old_ops = cleancache_register_ops(&tmem_cleancache_ops); - if 
(old_ops.init_fs != NULL) - s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); - } -#endif - return 0; -} - -module_init(xen_tmem_init) diff --git a/trunk/fs/9p/vfs_inode.c b/trunk/fs/9p/vfs_inode.c index 8d7f3e69ae29..7f6c67703195 100644 --- a/trunk/fs/9p/vfs_inode.c +++ b/trunk/fs/9p/vfs_inode.c @@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d) int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) { - dentry_unhash(d); return v9fs_remove(i, d, 1); } @@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - P9_DPRINTK(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; diff --git a/trunk/fs/Kconfig b/trunk/fs/Kconfig index 19891aab9c6e..979992dcb386 100644 --- a/trunk/fs/Kconfig +++ b/trunk/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT @@ -121,20 +121,6 @@ config TMPFS See for details. -config TMPFS_POSIX_ACL - bool "Tmpfs POSIX Access Control Lists" - depends on TMPFS - select TMPFS_XATTR - select GENERIC_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website . - - If you don't know what Access Control Lists are, say N. - config TMPFS_XATTR bool "Tmpfs extended attributes" depends on TMPFS @@ -147,9 +133,22 @@ config TMPFS_XATTR Currently this enables support for the trusted.* and security.* namespaces. + If unsure, say N. + You need this for POSIX ACL support on tmpfs. - If unsure, say N. +config TMPFS_POSIX_ACL + bool "Tmpfs POSIX Access Control Lists" + depends on TMPFS_XATTR + select GENERIC_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. 
config HUGETLBFS bool "HugeTLB file system support" diff --git a/trunk/fs/affs/namei.c b/trunk/fs/affs/namei.c index 03330e2e390c..e3e9efc1fdd8 100644 --- a/trunk/fs/affs/namei.c +++ b/trunk/fs/affs/namei.c @@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); - dentry_unhash(dentry); - return affs_remove_header(dentry); } @@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); diff --git a/trunk/fs/afs/dir.c b/trunk/fs/afs/dir.c index 2c4e05160042..20c106f24927 100644 --- a/trunk/fs/afs/dir.c +++ b/trunk/fs/afs/dir.c @@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - dentry_unhash(dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; @@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - vnode = AFS_FS_I(old_dentry->d_inode); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/trunk/fs/autofs4/root.c b/trunk/fs/autofs4/root.c index 87d95a8cddbc..f55ae23b137e 100644 --- a/trunk/fs/autofs4/root.c +++ b/trunk/fs/autofs4/root.c @@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EACCES; - dentry_unhash(dentry); - if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) diff --git a/trunk/fs/bfs/dir.c b/trunk/fs/bfs/dir.c index c7d1d06b0483..b14cebfd9047 100644 --- a/trunk/fs/bfs/dir.c +++ b/trunk/fs/bfs/dir.c @@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct bfs_sb_info *info; int error = -ENOENT; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - old_bh = new_bh = NULL; old_inode = old_dentry->d_inode; if (S_ISDIR(old_inode->i_mode)) diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 4f9893243dae..96fcfa522dab 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "extent_io.h" #include "extent_map.h" #include "compat.h" @@ -2017,13 +2016,6 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - goto out; - } - } - end = page_end; while (1) { lock_extent(tree, start, end, GFP_NOFS); @@ -2157,7 +2149,6 @@ static int __extent_read_full_page(struct extent_io_tree *tree, cur = cur + iosize; page_offset += iosize; } -out: if (!nr) { if (!PageError(page)) SetPageUptodate(page); diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index be4ffa12f3ef..0ac712efcdf2 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -39,7 +39,6 @@ #include #include #include -#include #include 
"compat.h" #include "ctree.h" #include "disk-io.h" @@ -625,7 +624,6 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_root = root_dentry; save_mount_options(sb, data); - cleancache_init_fs(sb); return 0; fail_close: diff --git a/trunk/fs/buffer.c b/trunk/fs/buffer.c index 698c6b2cc462..a08bb8e61c6f 100644 --- a/trunk/fs/buffer.c +++ b/trunk/fs/buffer.c @@ -41,7 +41,6 @@ #include #include #include -#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -270,10 +269,6 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); @@ -2336,26 +2331,24 @@ EXPORT_SYMBOL(block_commit_write); * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. - * - * Direct callers of this function should call vfs_check_frozen() so that page - * fault does not busyloop until the fs is thawed. */ -int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +int +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; - int ret; + int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || (page_offset(page) > size)) { - /* We overload EFAULT to mean page got truncated */ - ret = -EFAULT; - goto out_unlock; + /* page got truncated out from underneath us */ + unlock_page(page); + goto out; } /* page is wholly or partially inside EOF */ @@ -2368,40 +2361,17 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (!ret) ret = block_commit_write(page, 0, end); - if (unlikely(ret < 0)) - goto out_unlock; - /* - * Freezing in progress? We check after the page is marked dirty and - * with page lock held so if the test here fails, we are sure freezing - * code will wait during syncing until the page fault is done - at that - * point page will be dirty and unlocked so freezing code will write it - * and writeprotect it again. - */ - set_page_dirty(page); - if (inode->i_sb->s_frozen != SB_UNFROZEN) { - ret = -EAGAIN; - goto out_unlock; - } - return 0; -out_unlock: - unlock_page(page); - return ret; -} -EXPORT_SYMBOL(__block_page_mkwrite); - -int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - int ret; - struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; + if (unlikely(ret)) { + unlock_page(page); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + } else + ret = VM_FAULT_LOCKED; - /* - * This check is racy but catches the common case. The check in - * __block_page_mkwrite() is reliable. 
- */ - vfs_check_frozen(sb, SB_FREEZE_WRITE); - ret = __block_page_mkwrite(vma, vmf, get_block); - return block_page_mkwrite_return(ret); +out: + return ret; } EXPORT_SYMBOL(block_page_mkwrite); diff --git a/trunk/fs/coda/dir.c b/trunk/fs/coda/dir.c index a46126fd5735..2b8dae4d121e 100644 --- a/trunk/fs/coda/dir.c +++ b/trunk/fs/coda/dir.c @@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) int len = de->d_name.len; int error; - dentry_unhash(de); - error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ @@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, int new_length = new_dentry->d_name.len; int error; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); diff --git a/trunk/fs/configfs/dir.c b/trunk/fs/configfs/dir.c index 9d17d350abc5..9a37a9b6de3a 100644 --- a/trunk/fs/configfs/dir.c +++ b/trunk/fs/configfs/dir.c @@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; - dentry_unhash(dentry); - if (dentry->d_parent == configfs_sb->s_root) return -EPERM; diff --git a/trunk/fs/ecryptfs/inode.c b/trunk/fs/ecryptfs/inode.c index 227b409b8406..4d4cc6a90cd5 100644 --- a/trunk/fs/ecryptfs/inode.c +++ b/trunk/fs/ecryptfs/inode.c @@ -521,8 +521,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct dentry *lower_dir_dentry; int rc; - dentry_unhash(dentry); - lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); @@ -573,9 +571,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dir_dentry; struct dentry *trap = NULL; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); diff --git a/trunk/fs/ext3/super.c b/trunk/fs/ext3/super.c index aad153ef6b78..3c6a9e0eadc1 100644 --- a/trunk/fs/ext3/super.c +++ b/trunk/fs/ext3/super.c @@ -36,7 +36,6 @@ #include #include #include -#include #include @@ -1368,7 +1367,6 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, } else { ext3_msg(sb, KERN_INFO, "using internal journal"); } - cleancache_init_fs(sb); return res; } diff --git a/trunk/fs/ext4/Makefile b/trunk/fs/ext4/Makefile index 04109460ba9e..c947e36eda6c 100644 --- a/trunk/fs/ext4/Makefile +++ b/trunk/fs/ext4/Makefile @@ -6,8 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/trunk/fs/ext4/balloc.c b/trunk/fs/ext4/balloc.c index 264f6949511e..1c67139ad4b4 100644 --- a/trunk/fs/ext4/balloc.c +++ b/trunk/fs/ext4/balloc.c @@ -361,6 +361,130 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } +/** + * ext4_add_groupblocks() -- Add given blocks to 
an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap. We ask the + * mballoc to reload the buddy after this by setting group + * EXT4_GROUP_INFO_NEED_INIT_BIT flag + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + goto error_return; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + /* + * We are about to add blocks to the bitmap, + * so we need undo access. + */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. 
Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + /* + * make sure we don't allow a parallel init on other groups in the + * same buddy cache + */ + down_write(&grp->alloc_sem); + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + ext4_lock_group(sb, block_group); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); + } + /* + * request to reload the buddy with the + * new bitmap information + */ + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + grp->bb_free += blocks_freed; + up_write(&grp->alloc_sem); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + /** * ext4_has_free_blocks() * @sbi: in-core super block structure. @@ -369,8 +493,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) * Check if filesystem has nblocks free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) { s64 free_blocks, dirty_blocks, root_blocks; struct percpu_counter *fbc = &sbi->s_freeblocks_counter; @@ -384,6 +507,11 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, EXT4_FREEBLOCKS_WATERMARK) { free_blocks = percpu_counter_sum_positive(fbc); dirty_blocks = percpu_counter_sum_positive(dbc); + if (dirty_blocks < 0) { + printk(KERN_CRIT "Dirty block accounting " + "went wrong %lld\n", + (long long)dirty_blocks); + } } /* Check whether we have space after * accounting for current dirty blocks & root reserved blocks. @@ -394,9 +522,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved blocks available? 
*/ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - + capable(CAP_SYS_RESOURCE)) { if (free_blocks >= (nblocks + dirty_blocks)) return 1; } @@ -405,9 +531,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, } int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) + s64 nblocks) { - if (ext4_has_free_blocks(sbi, nblocks, flags)) { + if (ext4_has_free_blocks(sbi, nblocks)) { percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); return 0; } else @@ -428,7 +554,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; @@ -451,8 +577,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned int flags, - unsigned long *count, int *errp) + ext4_fsblk_t goal, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; @@ -462,7 +587,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; - ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) diff --git a/trunk/fs/ext4/ext4.h b/trunk/fs/ext4/ext4.h index a74b89c09f90..4daaf2b753f4 100644 --- a/trunk/fs/ext4/ext4.h +++ b/trunk/fs/ext4/ext4.h @@ -108,8 +108,7 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 -/* Use reserved root blocks if needed */ -#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -210,8 +209,6 @@ struct ext4_io_submit { */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ -#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ -#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ @@ -515,10 +512,6 @@ struct ext4_new_group_data { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 - /* Don't normalize allocation size (used for fallocate) */ -#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* * Flags used by ext4_free_blocks @@ -1035,7 +1028,7 @@ struct ext4_super_block { __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ @@ -1151,9 +1144,6 @@ struct ext4_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif - /* ext4 extent cache stats */ - unsigned long extent_cache_hits; - unsigned 
long extent_cache_misses; /* for buddy allocator */ struct ext4_group_info ***s_group_info; @@ -1211,9 +1201,6 @@ struct ext4_sb_info { struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; - - /* Kernel thread for multiple mount protection */ - struct task_struct *s_mmp_tsk; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1351,7 +1338,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 -#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1365,29 +1351,13 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR -#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_META_BG) -#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR) - -#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR -#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_RECOVER| \ - EXT4_FEATURE_INCOMPAT_META_BG) -#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR) - #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_FLEX_BG) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1620,6 +1590,12 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, */ struct ext4_lazy_init { unsigned long li_state; + + wait_queue_head_t li_wait_daemon; + wait_queue_head_t li_wait_task; + struct timer_list li_timer; + struct task_struct *li_task; + struct list_head li_request_list; struct mutex li_list_mtx; }; @@ -1638,67 +1614,6 @@ struct ext4_features { struct completion f_kobj_unregister; }; -/* - * This structure will be used for multiple mount protection. It will be - * written into the block number saved in the s_mmp_block field in the - * superblock. Programs that check MMP should assume that if - * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe - * to use the filesystem, regardless of how old the timestamp is. - */ -#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ -#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ -#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ -#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ - -struct mmp_struct { - __le32 mmp_magic; /* Magic number for MMP */ - __le32 mmp_seq; /* Sequence no. 
updated periodically */ - - /* - * mmp_time, mmp_nodename & mmp_bdevname are only used for information - * purposes and do not affect the correctness of the algorithm - */ - __le64 mmp_time; /* Time last updated */ - char mmp_nodename[64]; /* Node which last updated MMP block */ - char mmp_bdevname[32]; /* Bdev which last updated MMP block */ - - /* - * mmp_check_interval is used to verify if the MMP block has been - * updated on the block device. The value is updated based on the - * maximum time to write the MMP block during an update cycle. - */ - __le16 mmp_check_interval; - - __le16 mmp_pad1; - __le32 mmp_pad2[227]; -}; - -/* arguments passed to the mmp thread */ -struct mmpd_data { - struct buffer_head *bh; /* bh from initial read_mmp_block() */ - struct super_block *sb; /* super block of the fs */ -}; - -/* - * Check interval multiplier - * The MMP block is written every update interval and initially checked every - * update interval x the multiplier (the value is then adapted based on the - * write latency). The reason is that writes can be delayed under load and we - * don't want readers to incorrectly assume that the filesystem is no longer - * in use. - */ -#define EXT4_MMP_CHECK_MULT 2UL - -/* - * Minimum interval for MMP checking in seconds. - */ -#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL - -/* - * Maximum interval for MMP checking in seconds. - */ -#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - /* * Function prototypes */ @@ -1723,12 +1638,10 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, - unsigned int flags, - unsigned long *count, - int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags); + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, @@ -1793,8 +1706,6 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ @@ -1818,7 +1729,6 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -1828,8 +1738,6 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int 
ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1880,10 +1788,6 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int, __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, - const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ - __LINE__, msg) extern void __ext4_grp_locked_error(const char *, unsigned int, \ struct super_block *, ext4_group_t, \ unsigned long, ext4_fsblk_t, \ @@ -2160,8 +2064,6 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, @@ -2190,9 +2092,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc); -/* mmp.c */ -extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); - /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ diff --git a/trunk/fs/ext4/ext4_jbd2.c b/trunk/fs/ext4/ext4_jbd2.c index f5240aa15601..6e272ef6ba96 100644 --- a/trunk/fs/ext4/ext4_jbd2.c +++ b/trunk/fs/ext4/ext4_jbd2.c @@ -6,6 +6,20 @@ #include +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_undo_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } + return err; +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { diff --git a/trunk/fs/ext4/ext4_jbd2.h b/trunk/fs/ext4/ext4_jbd2.h index bb85757689b6..d0f53538a57f 100644 --- a/trunk/fs/ext4/ext4_jbd2.h +++ b/trunk/fs/ext4/ext4_jbd2.h @@ -126,6 +126,9 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); @@ -143,6 +146,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, int __ext4_handle_dirty_super(const char *where, unsigned int line, handle_t *handle, struct super_block *sb); +#define ext4_journal_get_undo_access(handle, bh) \ + __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ diff --git a/trunk/fs/ext4/extents.c b/trunk/fs/ext4/extents.c 
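The undo-access helper re-added to ext4_jbd2.[ch] above uses the same call-site-capturing wrapper pattern as the existing ext4_journal_get_write_access: a macro expands __func__ and __LINE__ at each caller, so a failed journal call can be reported with the exact call site. A minimal userspace sketch of that pattern (hypothetical names, not code from this patch):

#include <stdio.h>

/* Illustration only: the call-site-capturing wrapper behind
 * ext4_journal_get_undo_access(). Names here are made up. */
static int __do_get_access(const char *where, unsigned int line,
                           int handle_valid)
{
        int err = 0;

        if (handle_valid) {
                err = -1; /* pretend the journal call failed */
                if (err)
                        fprintf(stderr, "abort at %s:%u (err %d)\n",
                                where, line, err);
        }
        return err;
}

/* The macro captures __func__/__LINE__ per call site, so the error
 * path can name the caller without needing a backtrace. */
#define do_get_access(valid) __do_get_access(__func__, __LINE__, (valid))

int main(void)
{
        return do_get_access(1) ? 1 : 0;
}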
index 5199bac7fc62..4890d6f3ad15 100644 --- a/trunk/fs/ext4/extents.c +++ b/trunk/fs/ext4/extents.c @@ -46,13 +46,6 @@ #include -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - struct ext4_map_blocks *map, - int split_flag, - int flags); - static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -199,13 +192,12 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex, int *err, unsigned int flags) + struct ext4_extent *ex, int *err) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_blocks(handle, inode, goal, flags, - NULL, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); return newblock; } @@ -482,43 +474,9 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) } ext_debug("\n"); } - -static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, - ext4_fsblk_t newblock, int level) -{ - int depth = ext_depth(inode); - struct ext4_extent *ex; - - if (depth != level) { - struct ext4_extent_idx *idx; - idx = path[level].p_idx; - while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", level, - le32_to_cpu(idx->ei_block), - ext4_idx_pblock(idx), - newblock); - idx++; - } - - return; - } - - ex = path[depth].p_ext; - while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", - le32_to_cpu(ex->ee_block), - ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - newblock); - ex++; - } -} - #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) -#define ext4_ext_show_move(inode, path, newblock, level) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -834,14 +792,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext, int at) + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; + struct ext4_extent *ex; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; @@ -889,7 +847,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err, flags); + newext, &err); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -918,6 +876,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; + ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != @@ -929,12 +888,25 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } /* start copy from next extent */ - m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; - ext4_ext_show_move(inode, path, newblock, depth); + /* TODO: we could do it by single memmove */ + m = 0; + 
path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(path[depth].p_ext->ee_block), + ext4_ext_pblock(path[depth].p_ext), + ext4_ext_is_uninitialized(path[depth].p_ext), + ext4_ext_get_actual_len(path[depth].p_ext), + newblock); + /*memmove(ex++, path[depth].p_ext++, + sizeof(struct ext4_extent)); + neh->eh_entries++;*/ + path[depth].p_ext++; + m++; + } if (m) { - struct ext4_extent *ex; - ex = EXT_FIRST_EXTENT(neh); - memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); le16_add_cpu(&neh->eh_entries, m); } @@ -996,8 +968,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; - /* move remainder of path[i] to the new index block */ + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, @@ -1006,13 +982,20 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - /* start copy indexes */ - m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, - EXT_MAX_INDEX(path[i].p_hdr)); - ext4_ext_show_move(inode, path, newblock, i); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", i, + le32_to_cpu(path[i].p_idx->ei_block), + ext4_idx_pblock(path[i].p_idx), + newblock); + /*memmove(++fidx, path[i].p_idx++, + sizeof(struct ext4_extent_idx)); + neh->eh_entries++; + BUG_ON(neh->eh_entries > neh->eh_max);*/ + path[i].p_idx++; + m++; + } if (m) { - memmove(++fidx, path[i].p_idx, + memmove(++fidx, path[i].p_idx - m, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } @@ -1073,9 +1056,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; @@ -1083,8 +1065,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err, flags); + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); if (newblock == 0) return err; @@ -1159,9 +1140,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, * if no free index is found, then it requests in-depth growing. 
*/ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1181,7 +1161,7 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, path, newext, i); if (err) goto out; @@ -1194,8 +1174,7 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, - path, newext); + err = ext4_ext_grow_indepth(handle, inode, path, newext); if (err) goto out; @@ -1584,7 +1563,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ -static int ext4_ext_try_to_merge_right(struct inode *inode, +static int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { @@ -1623,31 +1602,6 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, return merge_done; } -/* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. - */ -static int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) { - struct ext4_extent_header *eh; - unsigned int depth; - int merge_done = 0; - int ret = 0; - - depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); - eh = path[depth].p_hdr; - - if (ex > EXT_FIRST_EXTENT(eh)) - merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); - - if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); - - return ret; -} - /* * check if a portion of the "newext" extent overlaps with an * existing extent. @@ -1714,7 +1668,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; - int flags = 0; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1789,9 +1742,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + err = ext4_ext_create_new_leaf(handle, inode, path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2052,25 +2003,13 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. 
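ext4_ext_try_to_merge(), restored above to its merge-right-only form, fuses two neighbouring extents only when they are contiguous in both the logical and the physical block space, which is why callers that want a left merge pass "ex - 1". A standalone sketch of that contiguity test, with simplified stand-in types and the length-limit checks omitted:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for struct ext4_extent fields (sketch only). */
struct ext {
        uint32_t lblk;   /* first logical block */
        uint64_t pblk;   /* first physical block */
        uint16_t len;    /* number of blocks */
};

/* Two extents can fuse only if they are back-to-back both logically
 * and physically; this mirrors the test behind ext4_ext_try_to_merge(). */
static int can_merge(const struct ext *a, const struct ext *b)
{
        return a->lblk + a->len == b->lblk &&
               a->pblk + a->len == b->pblk;
}

int main(void)
{
        struct ext a = { 0, 1000, 8 };
        struct ext b = { 8, 1008, 4 };

        if (can_merge(&a, &b)) {
                a.len += b.len;          /* absorb the right neighbour */
                printf("merged: %u..%u\n", (unsigned)a.lblk,
                       (unsigned)(a.lblk + a.len - 1));
        }
        return 0;
}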
- * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; int ret = 0; /* @@ -2078,59 +2017,25 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ex->ee_block = cpu_to_le32(cex->ec_block); + ext4_ext_store_pblock(ex, cex->ec_start); + ex->ee_len = cpu_to_le16(cex->ec_len); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); ret = 1; } errout: - if (!ret) - sbi->extent_cache_misses++; - else - sbi->extent_cache_hits++; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; -} - - /* * ext4_ext_rm_idx: * removes index from the index block. 
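ext4_ext_in_cache(), rewritten above to fill a struct ext4_extent directly, consults a single cached (block, len, start) triple that is valid only while ec_len is non-zero. A minimal illustration of that one-slot lookup, with the i_block_reservation_lock locking omitted:

#include <stdint.h>
#include <stdio.h>

/* One-slot extent cache as consulted by ext4_ext_in_cache(); a zero
 * ec_len marks the slot invalid. Sketch only, not the patch's code. */
struct ext_cache {
        uint32_t ec_block;
        uint32_t ec_len;        /* 0 means the cache is invalid */
        uint64_t ec_start;
};

static int in_cache(const struct ext_cache *c, uint32_t block,
                    uint64_t *pblk)
{
        if (c->ec_len == 0)
                return 0;                        /* nothing cached */
        if (block < c->ec_block || block >= c->ec_block + c->ec_len)
                return 0;                        /* outside cached range */
        *pblk = c->ec_start + (block - c->ec_block);
        return 1;
}

int main(void)
{
        struct ext_cache c = { 100, 16, 5000 };
        uint64_t pblk;

        if (in_cache(&c, 105, &pblk))
                printf("block 105 -> pblk %llu\n",
                       (unsigned long long)pblk);
        return 0;
}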
@@ -2258,16 +2163,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); - + printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); } else { printk(KERN_INFO "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2276,22 +2173,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, return 0; } - -/* - * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent - * - * @handle: The journal handle - * @inode: The files inode - * @path: The path to the leaf - * @start: The first block to remove - * @end: The last block to remove - */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start, - ext4_lblk_t end) + struct ext4_ext_path *path, ext4_lblk_t start) { int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; @@ -2302,7 +2186,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; - struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2332,95 +2215,31 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; + b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? + ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; ext_debug(" border %u:%u\n", a, b); - /* If this extent is beyond the end of the hole, skip it */ - if (end <= ex_ee_block) { - ex--; - ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = ext4_ext_get_actual_len(ex); - continue; - } else if (a != ex_ee_block && - b != ex_ee_block + ex_ee_len - 1) { - /* - * If this is a truncate, then this condition should - * never happen because at least one of the end points - * needs to be on the edge of the extent. - */ - if (end == EXT_MAX_BLOCK) { - ext_debug(" bad truncate %u:%u\n", - start, end); - block = 0; - num = 0; - err = -EIO; - goto out; - } - /* - * else this is a hole punch, so the extent needs to - * be split since neither edge of the hole is on the - * extent edge - */ - else{ - map.m_pblk = ext4_ext_pblock(ex); - map.m_lblk = ex_ee_block; - map.m_len = b - ex_ee_block; - - err = ext4_split_extent(handle, - inode, path, &map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (err < 0) - goto out; - - ex_ee_len = ext4_ext_get_actual_len(ex); - - b = ex_ee_block+ex_ee_len - 1 < end ? 
- ex_ee_block+ex_ee_len - 1 : end; - - /* Then remove tail of this extent */ - block = ex_ee_block; - num = a - block; - } + if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { + block = 0; + num = 0; + BUG(); } else if (a != ex_ee_block) { /* remove tail of the extent */ block = ex_ee_block; num = a - block; } else if (b != ex_ee_block + ex_ee_len - 1) { /* remove head of the extent */ - block = b; - num = ex_ee_block + ex_ee_len - b; - - /* - * If this is a truncate, this condition - * should never happen - */ - if (end == EXT_MAX_BLOCK) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + block = a; + num = b - a; + /* there is no "make a hole" API yet */ + BUG(); } else { /* remove whole extent: excellent! */ block = ex_ee_block; num = 0; - if (a != ex_ee_block) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } - - if (b != ex_ee_block + ex_ee_len - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + BUG_ON(a != ex_ee_block); + BUG_ON(b != ex_ee_block + ex_ee_len - 1); } /* @@ -2451,13 +2270,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (num == 0) { /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - } else if (block != ex_ee_block) { - /* - * If this was a head removal, then we need to update - * the physical block since it is now at a different - * location - */ - ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); + le16_add_cpu(&eh->eh_entries, -1); } ex->ee_block = cpu_to_le32(block); @@ -2473,27 +2286,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - /* - * If the extent was completely released, - * we need to remove it from the leaf - */ - if (num == 0) { - if (end != EXT_MAX_BLOCK) { - /* - * For hole punching, we need to scoot all the - * extents up when an extent is removed so that - * we dont have blank extents in the middle - */ - memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * - sizeof(struct ext4_extent)); - - /* Now get rid of the one at the end */ - memset(EXT_LAST_EXTENT(eh), 0, - sizeof(struct ext4_extent)); - } - le16_add_cpu(&eh->eh_entries, -1); - } - ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); ex--; @@ -2534,8 +2326,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2574,8 +2365,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ - err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + err = ext4_ext_rm_leaf(handle, inode, path, start); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2739,195 +2529,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) return ret; } -/* - * used by extent splitting. - */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -/* - * ext4_split_extent_at() splits an extent at given block. 
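The a/b clamping restored in ext4_ext_rm_leaf() above reduces partial removal to three cases: whole extent, tail, or head. A removal that starts and ends strictly inside one extent is exactly the case the old code refuses (BUG()) and that the removed hole-punch code handled by splitting the extent first. A standalone sketch of the classification, using made-up values:

#include <stdint.h>
#include <stdio.h>

/* Boundary arithmetic behind ext4_ext_rm_leaf(): clamp the removal
 * range [start, end] to one extent and classify the overlap.
 * Illustration only; the on-disk update is not shown. */
static void classify(uint32_t ee_block, uint16_t ee_len,
                     uint32_t start, uint32_t end)
{
        uint32_t last = ee_block + ee_len - 1;
        uint32_t a = start > ee_block ? start : ee_block;
        uint32_t b = end < last ? end : last;

        if (a == ee_block && b == last)
                printf("whole extent removed\n");
        else if (a == ee_block)
                printf("head removed: keep %u..%u\n",
                       (unsigned)(b + 1), (unsigned)last);
        else if (b == last)
                printf("tail removed: keep %u..%u\n",
                       (unsigned)ee_block, (unsigned)(a - 1));
        else
                printf("middle removed: extent must be split first\n");
}

int main(void)
{
        classify(100, 50, 120, 200);    /* tail removal */
        classify(100, 50, 110, 120);    /* middle: needs a split */
        return 0;
}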
- * - * @handle: the journal handle - * @inode: the file inode - * @path: the path to the extent - * @split: the logical block where the extent is splitted. - * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. - * @flags: flags used to insert new extent to extent tree. - * - * - * Splits extent [a, b] into two extents [a, @split) and [@split, b], states - * of which are deterimined by split_flag. - * - * There are two cases: - * a> the extent are splitted into two extent. - * b> split is not needed, and just mark the extent. - * - * return 0 on success. - */ -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t split, - int split_flag, - int flags) -{ - ext4_fsblk_t newblock; - ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex2 = NULL; - unsigned int ee_len, depth; - int err = 0; - - ext_debug("ext4_split_extents_at: inode %lu, logical" - "block %llu\n", inode->i_ino, (unsigned long long)split); - - ext4_ext_show_leaf(inode, path); - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - newblock = split - ee_block + ext4_ext_pblock(ex); - - BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - - if (split == ee_block) { - /* - * case b: block @split is the block that the extent begins with - * then we just change the state of the extent, and splitting - * is not needed. - */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); - else - ext4_ext_mark_initialized(ex); - - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } - - /* case a */ - memcpy(&orig_ex, ex, sizeof(orig_ex)); - ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); - - /* - * path may lead to new leaf, not to original leaf any more - * after ext4_ext_insert_extent() returns, - */ - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto fix_extent_len; - - ex2 = &newex; - ex2->ee_block = cpu_to_le32(split); - ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); - ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } else if (err) - goto fix_extent_len; - -out: - ext4_ext_show_leaf(inode, path); - return err; - -fix_extent_len: - ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); - return err; -} - -/* - * ext4_split_extents() splits an extent and mark extent which is covered - * by @map as split_flags indicates - * - * It may result in splitting the extent into multiple extents (upto three) - * There are three possibilities: - * a> There is no split required - * b> Splits in two extents: Split is happening at either end of the extent - * c> Splits in three extents: Somone is splitting in 
middle of the extent - * - */ -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - struct ext4_map_blocks *map, - int split_flag, - int flags) -{ - ext4_lblk_t ee_block; - struct ext4_extent *ex; - unsigned int ee_len, depth; - int err = 0; - int uninitialized; - int split_flag1, flags1; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); - - if (map->m_lblk + map->m_len < ee_block + ee_len) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; - err = ext4_split_extent_at(handle, inode, path, - map->m_lblk + map->m_len, split_flag1, flags1); - if (err) - goto out; - } - - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) - return PTR_ERR(path); - - if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; - err = ext4_split_extent_at(handle, inode, path, - map->m_lblk, split_flag1, flags); - if (err) - goto out; - } - - ext4_ext_show_leaf(inode, path); -out: - return err ? err : map->m_len; -} - #define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write @@ -2944,13 +2545,17 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { - struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; int err = 0; - int split_flag = 0; + int ret = 0; + int may_zeroout; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -2962,86 +2567,280 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); + eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); + + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); - WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. */ - split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; + may_zeroout = ee_block + ee_len <= eof_block; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - - err = ext4_ext_get_access(handle, inode, path + depth); + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; } + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } /* - * four cases: - * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. - * 4. split the extent into two extents with out zeroout. + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. */ - split_map.m_lblk = map->m_lblk; - split_map.m_len = map->m_len; - + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); + /* ex3: to ee_block + ee_len : uninitialised */ if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto out; - split_map.m_lblk = map->m_lblk; - split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - - ee_block); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto out; - } + unsigned int newdepth; + /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ + if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { + /* + * map->m_lblk == ee_block is handled by the zerouout + * at the beginning. + * Mark first half uninitialized. 
+ * Mark second half initialized and zero out the + * initialized extent + */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = cpu_to_le16(ee_len - allocated); + ext4_ext_mark_uninitialized(ex); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + + ex3 = &newex; + ex3->ee_block = cpu_to_le32(map->m_lblk); + ext4_ext_store_pblock(ex3, newblock); + ex3->ee_len = cpu_to_le16(allocated); + err = ext4_ext_insert_extent(handle, inode, path, + ex3, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, + ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* blocks available from map->m_lblk */ + return allocated; + + } else if (err) + goto fix_extent_len; + + /* + * We need to zero out the second half because + * an fallocate request can update file size and + * converting the second half to initialized extent + * implies that we can leak some junk data to user + * space. + */ + err = ext4_ext_zeroout(inode, ex3); + if (err) { + /* + * We should actually mark the + * second half as uninit and return error + * Insert would have changed the extent + */ + depth = ext_depth(inode); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + return err; + } + /* get the second half extent details */ + ex = path[depth].p_ext; + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; + } - split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; - allocated = map->m_len; + /* zeroed the second half */ + return allocated; } - } + ex3 = &newex; + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from map->m_lblk */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. 
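The error handling re-added throughout this function repeats one recovery pattern: attempt to insert the split-off extent, and if the tree has no room (ENOSPC), fall back to zeroing the original extent and marking it initialized, which makes the split unnecessary. A sketch of that fallback with stubbed-out helpers (hypothetical names):

#include <errno.h>
#include <stdio.h>

static int insert_extent(void)  { return -ENOSPC; } /* pretend tree is full */
static int zeroout_extent(void) { return 0; }       /* write zeroes instead */

static int convert_range(void)
{
        int err = insert_extent();

        if (err == -ENOSPC) {
                /* No room for a split: zero the whole extent and mark
                 * it initialized, restoring the original length. */
                err = zeroout_extent();
        }
        return err;
}

int main(void)
{
        printf("convert_range -> %d\n", convert_range());
        return 0;
}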
+ */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; - allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); - if (allocated < 0) - err = allocated; + allocated = map->m_len; + /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying + * to insert a extent in the middle zerout directly + * otherwise give the extent a chance to merge to left + */ + if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && + map->m_lblk != ee_block && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + /* blocks available from map->m_lblk */ + return allocated; + } + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(map->m_lblk); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + if (ex2 != ex) + goto insert; + /* + * New (initialized) extent starts from the first block + * in the current extent. i.e., ex2 == ex + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex2 > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex2 - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex2--; + } + } + /* + * Try to Merge towards right. This might be required + * only when the whole extent is being written to. + * i.e. ex2 == ex and ex3 == NULL. + */ + if (!ex3) { + ret = ext4_ext_try_to_merge(inode, path, ex2); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + } + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; out: + ext4_ext_show_leaf(inode, path); return err ? 
err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; } /* @@ -3072,11 +2871,15 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_ext_path *path, int flags) { - ext4_lblk_t eof_block; - ext4_lblk_t ee_block; - struct ext4_extent *ex; - unsigned int ee_len; - int split_flag = 0, depth; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + ext4_lblk_t ee_block, eof_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int may_zeroout; ext_debug("ext4_split_unwritten_extents: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3086,22 +2889,156 @@ static int ext4_split_unwritten_extents(handle_t *handle, inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. - */ + depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); - flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); -} + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + may_zeroout = ee_block + ee_len <= eof_block; + + /* + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. + */ + if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) + return allocated; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. 
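Both open-coded split paths restored by this patch carve an extent into at most three pieces: ex1 keeps the uninitialized head, ex2 covers the range being written, and ex3 keeps the uninitialized tail. A small sketch of the boundary arithmetic, using made-up values:

#include <stdint.h>
#include <stdio.h>

/* Geometry of the three-way split: head (ex1) and tail (ex3) exist
 * only when the written range does not reach the extent's edges. */
int main(void)
{
        uint32_t ee_block = 100, m_lblk = 110;
        uint16_t ee_len = 50, m_len = 20;

        if (m_lblk > ee_block)
                printf("ex1 (uninit): %u..%u\n",
                       (unsigned)ee_block, (unsigned)(m_lblk - 1));
        printf("ex2 (target): %u..%u\n",
               (unsigned)m_lblk, (unsigned)(m_lblk + m_len - 1));
        if (m_lblk + m_len < (uint32_t)(ee_block + ee_len))
                printf("ex3 (uninit): %u..%u\n",
                       (unsigned)(m_lblk + m_len),
                       (unsigned)(ee_block + ee_len - 1));
        return 0;
}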
+ */ + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > map->m_len) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from map->m_lblk */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = map->m_len; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written + * using direct I/O, uninitialised still. + */ + ex2->ee_block = cpu_to_le32(map->m_lblk); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + ext4_ext_mark_uninitialized(ex2); + if (ex2 != ex) + goto insert; + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + ext_debug("out here\n"); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + ext4_ext_show_leaf(inode, path); + return err ? 
err : allocated; +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -3110,27 +3047,46 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_extent_header *eh; int depth; int err = 0; + int ret = 0; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; - ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed + /* + * We have to see if it can be merged with the extent + * on the left. */ - ext4_ext_try_to_merge(inode, path, ex); - + if (ex > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex--; + } + } + /* + * Try to Merge towards right. + */ + ret = ext4_ext_try_to_merge(inode, path, ex); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + } /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); out: @@ -3346,19 +3302,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; - unsigned int punched_out = 0; - unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* @@ -3423,84 +3375,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - - /* - * Punch out the map length, but only to the - * end of the extent - */ - punched_out = allocated < map->m_len ? 
- allocated : map->m_len; - - /* - * Sense extents need to be converted to - * uninitialized, they must fit in an - * uninitialized extent - */ - if (punched_out > EXT_UNINIT_MAX_LEN) - punched_out = EXT_UNINIT_MAX_LEN; - - punch_map.m_lblk = map->m_lblk; - punch_map.m_pblk = newblock; - punch_map.m_len = punched_out; - punch_map.m_flags = 0; - - /* Check to see if the extent needs to be split */ - if (punch_map.m_len != ee_len || - punch_map.m_lblk != ee_block) { - - ret = ext4_split_extent(handle, inode, - path, &punch_map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (ret < 0) { - err = ret; - goto out2; - } - /* - * find extent for the block at - * the start of the hole - */ - ext4_ext_drop_refs(path); - kfree(path); - - path = ext4_ext_find_extent(inode, - map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - ee_block = le32_to_cpu(ex->ee_block); - ee_start = ext4_ext_pblock(ex); - + /* Do not put uninitialized extent in the cache */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; } - - ext4_ext_mark_uninitialized(ex); - - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); - - goto out2; + ret = ext4_ext_handle_uninitialized_extents(handle, + inode, map, path, flags, allocated, + newblock); + return ret; } } @@ -3562,8 +3446,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, else /* disable in-core preallocation for non-regular files */ ar.flags = 0; - if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) - ar.flags |= EXT4_MB_HINT_NOPREALLOC; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -3647,11 +3529,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, newblock, map->m_len, err ? err : allocated); - - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? - punched_out : allocated; - - return err ? err : result; + return err ? err : allocated; } void ext4_ext_truncate(struct inode *inode) @@ -3699,7 +3577,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3707,9 +3585,8 @@ void ext4_ext_truncate(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); - up_write(&EXT4_I(inode)->i_data_sem); - out_stop: + up_write(&EXT4_I(inode)->i_data_sem); /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. 
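The orphan-record comment above is the key to the truncate path: the inode goes onto the orphan list before any blocks are freed, so a crash in the middle of a multi-transaction truncate can be finished by journal recovery, and it comes off the list only once the truncate completes. The ordering, sketched with hypothetical helpers (not the patch's code):

#include <stdio.h>

static void orphan_add(void)  { puts("inode on orphan list"); }
static void free_blocks(void) { puts("freeing blocks across many txns"); }
static void orphan_del(void)  { puts("inode off orphan list"); }

/* If the machine dies between orphan_add() and orphan_del(), journal
 * recovery finds the inode on the orphan list and finishes the
 * truncate, so no half-freed extents survive a crash. */
int main(void)
{
        orphan_add();
        free_blocks();
        orphan_del();
        return 0;
}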
@@ -3774,6 +3651,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3781,13 +3662,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; - - if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); - trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -3817,8 +3691,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) break; } ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -3949,7 +3822,6 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, pgoff_t last_offset; pgoff_t offset; pgoff_t index; - pgoff_t start_index = 0; struct page **pages = NULL; struct buffer_head *bh = NULL; struct buffer_head *head = NULL; @@ -3976,57 +3848,39 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, kfree(pages); return EXT_CONTINUE; } - index = 0; -next_page: /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + end = ((__u64)pages[0]->index << PAGE_SHIFT) >> blksize_bits; - if (!page_has_buffers(pages[index])) + if (!page_has_buffers(pages[0])) goto out; - head = page_buffers(pages[index]); + head = page_buffers(pages[0]); if (!head) goto out; - index++; bh = head; do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; + if (buffer_mapped(bh)) { /* get the 1st mapped buffer. */ + if (end > newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; goto found_mapped_buffer; } - bh = bh->b_this_page; end++; } while (bh != head); - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; + /* No mapped buffer found. */ + goto out; } else { /*Find contiguous delayed buffers. */ if (ret > 0 && pages[0]->index == last_offset) head = page_buffers(pages[0]); bh = head; - index = 1; - start_index = 0; } found_mapped_buffer: @@ -4049,7 +3903,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, end++; } while (bh != head); - for (; index < ret; index++) { + for (index = 1; index < ret; index++) { if (!page_has_buffers(pages[index])) { bh = NULL; break; @@ -4059,10 +3913,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, bh = NULL; break; } - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { + pages[0]->index + index) { /* Blocks are not contiguous. */ bh = NULL; break; @@ -4154,177 +4006,6 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? 
error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct ext4_ext_cache cache_ex; - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; - struct address_space *mapping = inode->i_mapping; - struct ext4_map_blocks map; - handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; - int ret, credits, blocks_released, err = 0; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 0 : first_page_offset-1, - last_page_offset); - - if (err) - return err; - } - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(mapping, first_page_offset, - last_page_offset-1); - } - - /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - err = ext4_orphan_add(handle, inode); - if (err) - goto out; - - /* - * Now we need to zero out the un block aligned data. 
- * If the file is smaller than a block, just - * zero out the middle - */ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); - } - } - - /* If there are no blocks to remove, return now */ - if (first_block >= last_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - - /* - * Loop over all the blocks and identify blocks - * that need to be punched out - */ - iblock = first_block; - blocks_released = 0; - while (iblock < last_block) { - max_blocks = last_block - iblock; - num_blocks = 1; - memset(&map, 0, sizeof(map)); - map.m_lblk = iblock; - map.m_len = max_blocks; - ret = ext4_ext_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (ret > 0) { - blocks_released += ret; - num_blocks = ret; - } else if (ret == 0) { - /* - * If map blocks could not find the block, - * then it is in a hole. If the hole was - * not already cached, then map blocks should - * put it in the cache. So we can get the hole - * out of the cache - */ - memset(&cache_ex, 0, sizeof(cache_ex)); - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && - !cache_ex.ec_start) { - - /* The hole is cached */ - num_blocks = cache_ex.ec_block + - cache_ex.ec_len - iblock; - - } else { - /* The block could not be identified */ - err = -EIO; - break; - } - } else { - /* Map blocks error */ - err = ret; - break; - } - - if (num_blocks == 0) { - /* This condition should never happen */ - ext_debug("Block lookup failed"); - err = -EIO; - break; - } - - iblock += num_blocks; - } - - if (blocks_released > 0) { - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - } - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - return err; -} int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@ -4361,3 +4042,4 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + diff --git a/trunk/fs/ext4/file.c b/trunk/fs/ext4/file.c index 2c0972322009..7b80d543b89e 100644 --- a/trunk/fs/ext4/file.c +++ b/trunk/fs/ext4/file.c @@ -272,6 +272,7 @@ const struct file_operations ext4_file_operations = { }; const struct inode_operations ext4_file_inode_operations = { + .truncate = ext4_truncate, .setattr = ext4_setattr, .getattr = ext4_getattr, #ifdef CONFIG_EXT4_FS_XATTR diff --git a/trunk/fs/ext4/fsync.c b/trunk/fs/ext4/fsync.c index ce66d2fe826c..e9473cbe80df 100644 --- a/trunk/fs/ext4/fsync.c +++ b/trunk/fs/ext4/fsync.c @@ -36,7 +36,7 @@ static void dump_completed_IO(struct inode * inode) { -#ifdef EXT4FS_DEBUG +#ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; unsigned long flags; @@ -172,7 +172,6 @@ int ext4_sync_file(struct file *file, int datasync) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; tid_t commit_tid; - bool 
needs_barrier = false; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -212,12 +211,22 @@ int ext4_sync_file(struct file *file, int datasync) } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); - if (needs_barrier) + if (jbd2_log_start_commit(journal, commit_tid)) { + /* + * When the journal is on a different device than the + * fs data disk, we need to issue the barrier in + * writeback mode. (In ordered mode, the jbd2 layer + * will take care of issuing the barrier. In + * data=journal, all of the data blocks are written to + * the journal device.) + */ + if (ext4_should_writeback_data(inode) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL); + ret = jbd2_log_wait_commit(journal, commit_tid); + } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); out: trace_ext4_sync_file_exit(inode, ret); diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index 50d0e9c64584..f2fa5e8a582c 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -1930,7 +1930,7 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { dquot_release_reservation_block(inode, 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); @@ -2796,7 +2796,9 @@ static int write_cache_pages_da(struct address_space *mapping, continue; } - wait_on_page_writeback(page); + if (PageWriteback(page)) + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); if (mpd->next_page != page->index) @@ -3511,7 +3513,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - ext4_truncate_failed_write(inode); + vmtruncate(inode, isize); } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3913,31 +3915,10 @@ void ext4_set_aops(struct inode *inode) */ int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. 
If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, length, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3950,15 +3931,7 @@ int ext4_block_zero_page_range(handle_t *handle, return -EINVAL; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - + length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) @@ -4407,6 +4380,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, int ext4_can_truncate(struct inode *inode) { + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -4416,31 +4391,6 @@ int ext4_can_truncate(struct inode *inode) return 0; } -/* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks - * associated with the given offset and length - * - * @inode: File inode - * @offset: The offset where the hole will begin - * @len: The length of the hole - * - * Returns: 0 on sucess or negative on failure - */ - -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file->f_path.dentry->d_inode; - if (!S_ISREG(inode->i_mode)) - return -ENOTSUPP; - - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - /* TODO: Add support for non extent hole punching */ - return -ENOTSUPP; - } - - return ext4_ext_punch_hole(file, offset, length); -} - /* * ext4_truncate() * @@ -4667,7 +4617,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* * Figure out the offset within the block group inode table */ - inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); inode_offset = ((inode->i_ino - 1) % EXT4_INODES_PER_GROUP(sb)); block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); @@ -5361,7 +5311,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { + (attr->ia_size < inode->i_size || + (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5395,16 +5346,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + /* ext4_truncate will clear the flag */ + if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) ext4_truncate(inode); } + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) + rc = vmtruncate(inode, attr->ia_size); + if (!rc) { setattr_copy(inode, attr); mark_inode_dirty(inode); @@ -5861,19 +5811,15 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; } ret = 0; - - lock_page(page); - wait_on_page_writeback(page); - if (PageMappedToDisk(page)) { - 
up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; - } + if (PageMappedToDisk(page)) + goto out_unlock; if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; + lock_page(page); /* * return if we have all the buffers mapped. This avoid * the need to call write_begin/write_end which does a @@ -5883,8 +5829,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; + unlock_page(page); + goto out_unlock; } } unlock_page(page); @@ -5904,16 +5850,6 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret < 0) goto out_unlock; ret = 0; - - /* - * write_begin/end might have created a dirty page and someone - * could wander in and start the IO. Make sure that hasn't - * happened. - */ - lock_page(page); - wait_on_page_writeback(page); - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; out_unlock: if (ret) ret = VM_FAULT_SIGBUS; diff --git a/trunk/fs/ext4/mballoc.c b/trunk/fs/ext4/mballoc.c index 859f2ae8864e..d8a16eecf1d5 100644 --- a/trunk/fs/ext4/mballoc.c +++ b/trunk/fs/ext4/mballoc.c @@ -787,7 +787,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct inode *inode; char *data; char *bitmap; - struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); @@ -820,18 +819,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (first_group + i >= ngroups) break; - grinfo = ext4_get_group_info(sb, first_group + i); - /* - * If page is uptodate then we came here after online resize - * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, - * which may be currently in use by an allocating task. 
- */ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { - bh[i] = NULL; - continue; - } - err = -EIO; desc = ext4_get_group_desc(sb, first_group + i, NULL); if (desc == NULL) @@ -884,28 +871,26 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } /* wait for I/O completion */ - for (i = 0; i < groups_per_page; i++) - if (bh[i]) - wait_on_buffer(bh[i]); + for (i = 0; i < groups_per_page && bh[i]; i++) + wait_on_buffer(bh[i]); err = -EIO; - for (i = 0; i < groups_per_page; i++) - if (bh[i] && !buffer_uptodate(bh[i])) + for (i = 0; i < groups_per_page && bh[i]; i++) + if (!buffer_uptodate(bh[i])) goto out; err = 0; first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; + struct ext4_group_info *grinfo; group = (first_block + i) >> 1; if (group >= ngroups) break; - if (!bh[group - first_group]) - /* skip initialized uptodate buddy */ - continue; - /* * data carry information regarding this * particular group in the format specified @@ -934,8 +919,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); - /* init the buddy */ - memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; @@ -965,7 +948,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) out: if (bh) { - for (i = 0; i < groups_per_page; i++) + for (i = 0; i < groups_per_page && bh[i]; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -974,21 +957,22 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } /* - * Lock the buddy and bitmap pages. This make sure other parallel init_group - * on the same buddy page doesn't happen whild holding the buddy page lock. - * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. 
This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock */ -static int ext4_mb_get_buddy_page_lock(struct super_block *sb, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, + ext4_group_t group) { - struct inode *inode = EXT4_SB(sb)->s_buddy_cache; - int block, pnum, poff; + int i; + int block, pnum; int blocks_per_page; - struct page *page; - - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t first_group; + struct ext4_group_info *grp; blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; /* @@ -998,40 +982,57 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, */ block = group * 2; pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (!page) - return -EIO; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + first_group = pnum * blocks_per_page / 2; - if (blocks_per_page >= 2) { - /* buddy and bitmap are on the same page */ - return 0; - } + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (!page) - return -EIO; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; - return 0; + if ((first_group + i) >= ngroups) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write_nested(&grp->alloc_sem, i); + } + return i; } -static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - page_cache_release(e4b->bd_bitmap_page); - } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - page_cache_release(e4b->bd_buddy_page); + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. 
This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); } + } /* @@ -1043,60 +1044,93 @@ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *this_grp; - struct ext4_buddy e4b; - struct page *page; int ret = 0; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; mb_debug(1, "init group %u\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have pinned buddy page to page cache. + * would have taken the alloc_sem lock. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); - if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ + ret = 0; goto err; } - - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); - if (ret) - goto err; - if (!PageUptodate(page)) { + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); - if (e4b.bd_buddy_page == NULL) { + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ - ret = 0; - goto err; + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); } - /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); - if (ret) - goto err; - if (!PageUptodate(page)) { + if (page == NULL || !PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); err: - ext4_mb_put_buddy_page_lock(&e4b); + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); return ret; } @@ -1130,8 +1164,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + e4b->alloc_semp = &grp->alloc_sem; + + /* Take the read lock on the group alloc + * sem. 
This would make sure a parallel + * ext4_mb_init_group happening on other + * groups mapped by the page is blocked + * till we are done with allocation + */ +repeat_load_buddy: + down_read(e4b->alloc_semp); if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* we need to check for group need init flag + * with alloc_semp held so that we can be sure + * that new blocks didn't get added to the group + * when we are loading the buddy cache + */ + up_read(e4b->alloc_semp); /* * we need full data about the group * to make a good selection @@ -1139,6 +1189,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ret = ext4_mb_init_group(sb, group); if (ret) return ret; + goto repeat_load_buddy; } /* @@ -1222,14 +1273,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, return 0; err: - if (page) - page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; + + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); return ret; } @@ -1239,6 +1291,9 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); + /* Done with the buddy cache */ + if (e4b->alloc_semp) + up_read(e4b->alloc_semp); } @@ -1551,6 +1606,9 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); + /* on allocation we use ac to track the held semaphore */ + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2601,7 +2659,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, ret, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2611,9 +2669,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); + if (test_opt(sb, DISCARD)) { + ret = ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); + if (unlikely(ret == -EOPNOTSUPP)) { + ext4_warning(sb, "discard not supported, " + "disabling"); + clear_opt(sb, DISCARD); + } + } err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4162,12 +4226,15 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); } } + if (ac->alloc_semp) + up_read(ac->alloc_semp); if (pa) { /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding - * doesn't grow big. + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() */ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); @@ -4236,9 +4303,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. 
*/ - while (ar->len && - ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { - + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { /* let others to free the space */ yield(); ar->len = ar->len >> 1; @@ -4248,15 +4313,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, ar->len); - } else { - while (ar->len && - dquot_alloc_block(ar->inode, ar->len)) { - - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; - } + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; } inquota = ar->len; if (ar->len == 0) { @@ -4644,127 +4703,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, return; } -/** - * ext4_add_groupblocks() -- Add given blocks to an existing group - * @handle: handle to this transaction - * @sb: super block - * @block: start physcial block to add to the block group - * @count: number of blocks to free - * - * This marks the blocks as free in the bitmap and buddy. - */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - ext4_group_t block_group; - ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_buddy e4b; - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; - - ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) - goto error_return; - - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range(ext4_block_bitmap(sb, desc), block, count) || - in_range(ext4_inode_bitmap(sb, desc), block, count) || - in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, desc), - sbi->s_itb_per_group)) { - ext4_error(sb, "Adding blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - goto error_return; - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - - for (i = 0, blocks_freed = 0; i < count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - blocks_freed++; - } - } - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - - /* - * need to update group_info->bb_free and bitmap - * with group lock held. 
generate_buddy look at - * them with group lock_held - */ - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); - } - - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return; -} - /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system @@ -4777,10 +4715,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static void ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; + int ret = 0; assert_spin_locked(ext4_group_lock_ptr(sb, group)); @@ -4794,9 +4733,12 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); + + ret = ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; } /** @@ -4818,26 +4760,21 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, * the group buddy bitmap. This is done until whole group is scanned. */ static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) +ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { void *bitmap; ext4_grpblk_t next, count = 0; - struct ext4_buddy e4b; - int ret; + ext4_group_t group; + int ret = 0; - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); - return ret; - } - bitmap = e4b.bd_bitmap; + BUG_ON(e4b == NULL); + bitmap = e4b->bd_bitmap; + group = e4b->bd_group; + start = (e4b->bd_info->bb_first_free > start) ? + e4b->bd_info->bb_first_free : start; ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? 
- e4b.bd_info->bb_first_free : start; while (start < max) { start = mb_find_next_zero_bit(bitmap, max, start); @@ -4846,8 +4783,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max, start); if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, + next - start, group, e4b); + if (ret < 0) + break; count += next - start; } start = next + 1; @@ -4863,15 +4802,17 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b->bd_info->bb_free - count) < minblocks) break; } ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", count, group); + if (ret < 0) + count = ret; + return count; } @@ -4889,11 +4830,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct ext4_group_info *grp; + struct ext4_buddy e4b; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; - uint64_t start, len, minlen, trimmed = 0; + uint64_t start, len, minlen, trimmed; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; @@ -4901,6 +4842,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> sb->s_blocksize_bits; len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; @@ -4921,12 +4863,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) return -EINVAL; for (group = first_group; group <= last_group; group++) { - grp = ext4_get_group_info(sb, group); - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); - if (ret) - break; + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + break; } /* @@ -4939,14 +4880,16 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) last_block = first_block + len; len -= last_block - first_block; - if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_block, + if (e4b.bd_info->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, &e4b, first_block, last_block, minlen); if (cnt < 0) { ret = cnt; + ext4_mb_unload_buddy(&e4b); break; } } + ext4_mb_unload_buddy(&e4b); trimmed += cnt; first_block = 0; } diff --git a/trunk/fs/ext4/mballoc.h b/trunk/fs/ext4/mballoc.h index 20b5e7bfebd1..22bd4d7f289b 100644 --- a/trunk/fs/ext4/mballoc.h +++ b/trunk/fs/ext4/mballoc.h @@ -193,6 +193,11 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; + /* + * pointer to the held semaphore upon successful + * block allocation + */ + struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -210,6 +215,7 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; + struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) diff --git a/trunk/fs/ext4/migrate.c b/trunk/fs/ext4/migrate.c index 
b57b98fb44d1..92816b4e0f16 100644 --- a/trunk/fs/ext4/migrate.c +++ b/trunk/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/trunk/fs/ext4/mmp.c b/trunk/fs/ext4/mmp.c deleted file mode 100644 index 9bdef3f537c5..000000000000 --- a/trunk/fs/ext4/mmp.c +++ /dev/null @@ -1,351 +0,0 @@ -#include -#include -#include -#include -#include - -#include "ext4.h" - -/* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk - * faster. - */ -static int write_mmp_block(struct buffer_head *bh) -{ - mark_buffer_dirty(bh); - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(WRITE_SYNC, bh); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - return 1; - - return 0; -} - -/* - * Read the MMP block. It _must_ be read from disk and hence we clear the - * uptodate flag on the buffer. - */ -static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, - ext4_fsblk_t mmp_block) -{ - struct mmp_struct *mmp; - - if (*bh) - clear_buffer_uptodate(*bh); - - /* This would be sb_bread(sb, mmp_block), except we need to be sure - * that the MD RAID device cache has been bypassed, and that the read - * is not blocked in the elevator. */ - if (!*bh) - *bh = sb_getblk(sb, mmp_block); - if (*bh) { - get_bh(*bh); - lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; - } - } - if (!*bh) { - ext4_warning(sb, "Error while reading MMP block %llu", - mmp_block); - return -EIO; - } - - mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) - return -EINVAL; - - return 0; -} - -/* - * Dump as much information as possible to help the admin. - */ -void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, - const char *function, unsigned int line, const char *msg) -{ - __ext4_warning(sb, function, line, msg); - __ext4_warning(sb, function, line, - "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s\n", - (long long unsigned int) le64_to_cpu(mmp->mmp_time), - mmp->mmp_nodename, mmp->mmp_bdevname); -} - -/* - * kmmpd will update the MMP sequence every s_mmp_update_interval seconds - */ -static int kmmpd(void *data) -{ - struct super_block *sb = ((struct mmpd_data *) data)->sb; - struct buffer_head *bh = ((struct mmpd_data *) data)->bh; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct mmp_struct *mmp; - ext4_fsblk_t mmp_block; - u32 seq = 0; - unsigned long failed_writes = 0; - int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); - unsigned mmp_check_interval; - unsigned long last_update_time; - unsigned long diff; - int retval; - - mmp_block = le64_to_cpu(es->s_mmp_block); - mmp = (struct mmp_struct *)(bh->b_data); - mmp->mmp_time = cpu_to_le64(get_seconds()); - /* - * Start with the higher mmp_check_interval and reduce it if - * the MMP block is being updated on time. 
- */ - mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, - EXT4_MMP_MIN_CHECK_INTERVAL); - mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - bdevname(bh->b_bdev, mmp->mmp_bdevname); - - memcpy(mmp->mmp_nodename, init_utsname()->sysname, - sizeof(mmp->mmp_nodename)); - - while (!kthread_should_stop()) { - if (++seq > EXT4_MMP_SEQ_MAX) - seq = 1; - - mmp->mmp_seq = cpu_to_le32(seq); - mmp->mmp_time = cpu_to_le64(get_seconds()); - last_update_time = jiffies; - - retval = write_mmp_block(bh); - /* - * Don't spew too many error messages. Print one every - * (s_mmp_update_interval * 60) seconds. - */ - if (retval && (failed_writes % 60) == 0) { - ext4_error(sb, "Error writing to MMP block"); - failed_writes++; - } - - if (!(le32_to_cpu(es->s_feature_incompat) & - EXT4_FEATURE_INCOMPAT_MMP)) { - ext4_warning(sb, "kmmpd being stopped since MMP feature" - " has been disabled."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - if (sb->s_flags & MS_RDONLY) { - ext4_warning(sb, "kmmpd being stopped since filesystem " - "has been remounted as readonly."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - diff = jiffies - last_update_time; - if (diff < mmp_update_interval * HZ) - schedule_timeout_interruptible(mmp_update_interval * - HZ - diff); - - /* - * We need to make sure that more than mmp_check_interval - * seconds have not passed since writing. If that has happened - * we need to check if the MMP block is as we left it. - */ - diff = jiffies - last_update_time; - if (diff > mmp_check_interval * HZ) { - struct buffer_head *bh_check = NULL; - struct mmp_struct *mmp_check; - - retval = read_mmp_block(sb, &bh_check, mmp_block); - if (retval) { - ext4_error(sb, "error reading MMP data: %d", - retval); - - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - mmp_check = (struct mmp_struct *)(bh_check->b_data); - if (mmp->mmp_seq != mmp_check->mmp_seq || - memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, - sizeof(mmp->mmp_nodename))) { - dump_mmp_msg(sb, mmp_check, - "Error while updating MMP info. " - "The filesystem seems to have been" - " multiply mounted."); - ext4_error(sb, "abort"); - goto failed; - } - put_bh(bh_check); - } - - /* - * Adjust the mmp_check_interval depending on how much time - * it took for the MMP block to be written. - */ - mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, - EXT4_MMP_MAX_CHECK_INTERVAL), - EXT4_MMP_MIN_CHECK_INTERVAL); - mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - } - - /* - * Unmount seems to be clean. - */ - mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); - mmp->mmp_time = cpu_to_le64(get_seconds()); - - retval = write_mmp_block(bh); - -failed: - kfree(data); - brelse(bh); - return retval; -} - -/* - * Get a random new sequence number but make sure it is not greater than - * EXT4_MMP_SEQ_MAX. - */ -static unsigned int mmp_new_seq(void) -{ - u32 new_seq; - - do { - get_random_bytes(&new_seq, sizeof(u32)); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; -} - -/* - * Protect the filesystem from being mounted more than once. 
- */ -int ext4_multi_mount_protect(struct super_block *sb, - ext4_fsblk_t mmp_block) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct buffer_head *bh = NULL; - struct mmp_struct *mmp = NULL; - struct mmpd_data *mmpd_data; - u32 seq; - unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); - unsigned int wait_time = 0; - int retval; - - if (mmp_block < le32_to_cpu(es->s_first_data_block) || - mmp_block >= ext4_blocks_count(es)) { - ext4_warning(sb, "Invalid MMP block in superblock"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - - mmp = (struct mmp_struct *)(bh->b_data); - - if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) - mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; - - /* - * If check_interval in MMP block is larger, use that instead of - * update_interval from the superblock. - */ - if (mmp->mmp_check_interval > mmp_check_interval) - mmp_check_interval = mmp->mmp_check_interval; - - seq = le32_to_cpu(mmp->mmp_seq); - if (seq == EXT4_MMP_SEQ_CLEAN) - goto skip; - - if (seq == EXT4_MMP_SEQ_FSCK) { - dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); - goto failed; - } - - wait_time = min(mmp_check_interval * 2 + 1, - mmp_check_interval + 60); - - /* Print MMP interval if more than 20 secs. */ - if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) - ext4_warning(sb, "MMP interval %u higher than expected, please" - " wait.\n", wait_time * 2); - - if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - mmp = (struct mmp_struct *)(bh->b_data); - if (seq != le32_to_cpu(mmp->mmp_seq)) { - dump_mmp_msg(sb, mmp, - "Device is already active on another node."); - goto failed; - } - -skip: - /* - * write a new random sequence number. - */ - mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); - - retval = write_mmp_block(bh); - if (retval) - goto failed; - - /* - * wait for MMP interval and check mmp_seq. - */ - if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - mmp = (struct mmp_struct *)(bh->b_data); - if (seq != le32_to_cpu(mmp->mmp_seq)) { - dump_mmp_msg(sb, mmp, - "Device is already active on another node."); - goto failed; - } - - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); - if (!mmpd_data) { - ext4_warning(sb, "not enough memory for mmpd_data"); - goto failed; - } - mmpd_data->sb = sb; - mmpd_data->bh = bh; - - /* - * Start a kernel thread to update the MMP block periodically. - */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", - bdevname(bh->b_bdev, - mmp->mmp_bdevname)); - if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { - EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(mmpd_data); - ext4_warning(sb, "Unable to create kmmpd thread for %s.", - sb->s_id); - goto failed; - } - - return 0; - -failed: - brelse(bh); - return 1; -} - - diff --git a/trunk/fs/ext4/move_extent.c b/trunk/fs/ext4/move_extent.c index 2b8304bf3c50..b9f3e7862f13 100644 --- a/trunk/fs/ext4/move_extent.c +++ b/trunk/fs/ext4/move_extent.c @@ -876,7 +876,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * It needs to call wait_on_page_writeback() to wait for the * writeback of the page. 
*/ - wait_on_page_writeback(page); + if (PageWriteback(page)) + wait_on_page_writeback(page); /* Release old bh and drop refs */ try_to_release_page(page, 0); diff --git a/trunk/fs/ext4/namei.c b/trunk/fs/ext4/namei.c index b754b7721f51..67fd0b025858 100644 --- a/trunk/fs/ext4/namei.c +++ b/trunk/fs/ext4/namei.c @@ -1413,22 +1413,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; - - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { - /* - * Even if the block split failed, we have to properly write - * out all the changes we did so far. Otherwise we can end up - * with corrupted filesystem. - */ - ext4_mark_inode_dirty(handle, dir); - dx_release(frames); + dx_release (frames); + if (!(de)) return retval; - } - dx_release(frames); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); @@ -2252,7 +2240,6 @@ static int ext4_symlink(struct inode *dir, handle_t *handle; struct inode *inode; int l, err, retries = 0; - int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2260,26 +2247,10 @@ static int ext4_symlink(struct inode *dir, dquot_initialize(dir); - if (l > EXT4_N_BLOCKS * 4) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. - */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } retry: - handle = ext4_journal_start(dir, credits); + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2292,44 +2263,21 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) goto out_stop; - if (l > EXT4_N_BLOCKS * 4) { + if (l > sizeof(EXT4_I(inode)->i_data)) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* - * We cannot call page_symlink() with transaction started - * because it calls into ext4_write_begin() which can wait - * for transaction commit if we are running out of space - * and thus we deadlock. So we have to stop transaction now - * and restart it when symlink contents is written. - * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. + * page_symlink() calls into ext4_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). 
*/ - drop_nlink(inode); - err = ext4_orphan_add(handle, inode); - ext4_journal_stop(handle); - if (err) - goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS - * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext4_journal_start(dir, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto err_drop_inode; - } - inc_nlink(inode); - err = ext4_orphan_del(handle, inode); if (err) { - ext4_journal_stop(handle); clear_nlink(inode); - goto err_drop_inode; + unlock_new_inode(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; } } else { /* clear the extent format for fast symlink */ @@ -2345,10 +2293,6 @@ static int ext4_symlink(struct inode *dir, if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: - unlock_new_inode(inode); - iput(inode); - return err; } static int ext4_link(struct dentry *old_dentry, diff --git a/trunk/fs/ext4/page-io.c b/trunk/fs/ext4/page-io.c index 7bb8f76d470a..b6dbd056fcb1 100644 --- a/trunk/fs/ext4/page-io.c +++ b/trunk/fs/ext4/page-io.c @@ -203,29 +203,46 @@ static void ext4_end_bio(struct bio *bio, int error) for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; + int partial_write = 0; - if (error) { + head = page_buffers(page); + if (error) SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; + BUG_ON(!head); + if (head->b_size != PAGE_CACHE_SIZE) { + loff_t offset; + loff_t io_end_offset = io_end->offset + io_end->size; offset = (sector_t) page->index << PAGE_CACHE_SHIFT; bh = head; do { if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - + (offset+bh->b_size <= io_end_offset)) { + if (error) + buffer_io_error(bh); + + } + if (buffer_delay(bh)) + partial_write = 1; + else if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + else if (buffer_dirty(bh)) + partial_write = 1; offset += bh->b_size; bh = bh->b_this_page; } while (bh != head); } + /* + * If this is a partial write which happened to make + * all buffers uptodate then we can optimize away a + * bogus readpage() for the next read(). Here we + * 'discover' whether the page went uptodate as a + * result of this (potentially partial) write. 
+ */ + if (!partial_write) + SetPageUptodate(page); + put_io_page(io_end->pages[i]); } io_end->num_io_pages = 0; diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index cc5c157aa11d..8553dfb310af 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -76,27 +75,11 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); -static inline int ext2_feature_set_ok(struct super_block *sb); -static inline int ext3_feature_set_ok(struct super_block *sb); static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) -#else -#define IS_EXT2_SB(sb) (0) -#endif - - #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, @@ -823,8 +806,6 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1115,7 +1096,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, INIT_INODE_TABLE)) seq_puts(seq, ",noinit_inode_table"); - else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) + else if (sbi->s_li_wait_mult) seq_printf(seq, ",init_inode_table=%u", (unsigned) sbi->s_li_wait_mult); @@ -1206,7 +1187,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -1917,7 +1900,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, @@ -1949,7 +1932,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); - cleancache_init_fs(sb); return res; } @@ -2443,18 +2425,6 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } -static ssize_t extent_cache_hits_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); -} - -static ssize_t extent_cache_misses_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, 
PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); -} - static ssize_t inode_readahead_blks_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2512,8 +2482,6 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_RO_ATTR(extent_cache_hits); -EXT4_RO_ATTR(extent_cache_misses); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2529,8 +2497,6 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(extent_cache_hits), - ATTR_LIST(extent_cache_misses), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -2693,6 +2659,12 @@ static void print_daily_error_info(unsigned long arg) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } +static void ext4_lazyinode_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { @@ -2724,8 +2696,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); if (elr->lr_timeout == 0) { - timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; + timeout = jiffies - timeout; + if (elr->lr_sbi->s_li_wait_mult) + timeout *= elr->lr_sbi->s_li_wait_mult; + else + timeout *= 20; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -2737,7 +2712,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) /* * Remove lr_request from the list_request and free the - * request structure. Should be called with li_list_mtx held + * request tructure. 
Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { @@ -2755,16 +2730,14 @@ static void ext4_remove_li_request(struct ext4_li_request *elr) static void ext4_unregister_li_request(struct super_block *sb) { - mutex_lock(&ext4_li_mtx); - if (!ext4_li_info) { - mutex_unlock(&ext4_li_mtx); + struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; + + if (!ext4_li_info) return; - } mutex_lock(&ext4_li_info->li_list_mtx); - ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + ext4_remove_li_request(elr); mutex_unlock(&ext4_li_info->li_list_mtx); - mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; @@ -2783,10 +2756,17 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; struct list_head *pos, *n; struct ext4_li_request *elr; - unsigned long next_wakeup, cur; + unsigned long next_wakeup; + DEFINE_WAIT(wait); BUG_ON(NULL == eli); + eli->li_timer.data = (unsigned long)current; + eli->li_timer.function = ext4_lazyinode_timeout; + + eli->li_task = current; + wake_up(&eli->li_wait_task); + cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; @@ -2817,15 +2797,19 @@ static int ext4_lazyinit_thread(void *arg) if (freezing(current)) refrigerator(); - cur = jiffies; - if ((time_after_eq(cur, next_wakeup)) || + if ((time_after_eq(jiffies, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } - schedule_timeout_interruptible(next_wakeup - cur); - + eli->li_timer.expires = next_wakeup; + add_timer(&eli->li_timer); + prepare_to_wait(&eli->li_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + if (time_before(jiffies, next_wakeup)) + schedule(); + finish_wait(&eli->li_wait_daemon, &wait); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; @@ -2849,7 +2833,12 @@ static int ext4_lazyinit_thread(void *arg) goto cont_thread; } mutex_unlock(&eli->li_list_mtx); + del_timer_sync(&ext4_li_info->li_timer); + eli->li_task = NULL; + wake_up(&eli->li_wait_task); + kfree(ext4_li_info); + ext4_lazyinit_task = NULL; ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); @@ -2877,6 +2866,7 @@ static int ext4_run_lazyinit_thread(void) if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); + del_timer_sync(&ext4_li_info->li_timer); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4: error %d creating inode table " @@ -2885,6 +2875,8 @@ static int ext4_run_lazyinit_thread(void) return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + + wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL); return 0; } @@ -2919,9 +2911,13 @@ static int ext4_li_info_new(void) if (!eli) return -ENOMEM; + eli->li_task = NULL; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); + init_waitqueue_head(&eli->li_wait_daemon); + init_waitqueue_head(&eli->li_wait_task); + init_timer(&eli->li_timer); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; @@ -2964,19 +2960,20 @@ static int ext4_register_li_request(struct super_block *sb, ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; - if (sbi->s_li_request != NULL) { - /* - * Reset timeout so it can be computed again, because - * s_li_wait_mult might have changed. 
- */ - sbi->s_li_request->lr_timeout = 0; + if (sbi->s_li_request != NULL) return 0; - } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || - !test_opt(sb, INIT_INODE_TABLE)) + !test_opt(sb, INIT_INODE_TABLE)) { + sbi->s_li_request = NULL; return 0; + } + + if (first_not_zeroed == ngroups) { + sbi->s_li_request = NULL; + return 0; + } elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) @@ -3169,12 +3166,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - /* - * set default s_li_wait_mult for lazyinit, for the case there is - * no mount option specified. - */ - sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { ext4_msg(sb, KERN_WARNING, @@ -3196,28 +3187,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); - if (IS_EXT2_SB(sb)) { - if (ext2_feature_set_ok(sb)) - ext4_msg(sb, KERN_INFO, "mounting ext2 file system " - "using the ext4 subsystem"); - else { - ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " - "to feature incompatibilities"); - goto failed_mount; - } - } - - if (IS_EXT3_SB(sb)) { - if (ext3_feature_set_ok(sb)) - ext4_msg(sb, KERN_INFO, "mounting ext3 file system " - "using the ext4 subsystem"); - else { - ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " - "to feature incompatibilities"); - goto failed_mount; - } - } - /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -3490,11 +3459,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && - !(sb->s_flags & MS_RDONLY)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) - goto failed_mount3; - /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! 
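The ext4_lazyinit_thread hunks above back out a simplification: the minus side sleeps with a single schedule_timeout_interruptible(next_wakeup - cur) call, while the plus side restores the older machinery of an explicit kernel timer (li_timer) plus wait queue (li_wait_daemon). As a reading aid only, here is a minimal self-contained sketch of that restored idiom; the lazy_* names are invented for illustration, and this is a sketch of the pattern, not the patched ext4 code itself:

	#include <linux/jiffies.h>
	#include <linux/sched.h>
	#include <linux/timer.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(lazy_waitq);	/* cf. li_wait_daemon */
	static struct timer_list lazy_timer;		/* cf. li_timer */

	/* Timer callback, softirq context: just make the sleeper runnable. */
	static void lazy_timeout(unsigned long data)
	{
		wake_up_process((struct task_struct *)data);
	}

	/* One-time setup, run by the sleeping kthread itself. */
	static void lazy_sleep_setup(void)
	{
		init_timer(&lazy_timer);
		lazy_timer.function = lazy_timeout;
		lazy_timer.data = (unsigned long)current;
	}

	/* Sleep until next_wakeup (a jiffies value) or an earlier wake_up(). */
	static void lazy_sleep_until(unsigned long next_wakeup)
	{
		DEFINE_WAIT(wait);

		lazy_timer.expires = next_wakeup;
		add_timer(&lazy_timer);
		/*
		 * Queue ourselves before re-testing the deadline, so a wakeup
		 * arriving between the test and schedule() cannot be lost.
		 */
		prepare_to_wait(&lazy_waitq, &wait, TASK_INTERRUPTIBLE);
		if (time_before(jiffies, next_wakeup))
			schedule();
		finish_wait(&lazy_waitq, &wait);
		del_timer_sync(&lazy_timer);	/* disarm if we woke early */
	}

schedule_timeout_interruptible() performs the same arm/sleep/disarm sequence internally, which is why the newer code on the minus side needs neither the timer nor the extra wait-queue initialisation that these hunks bring back.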
@@ -3510,6 +3474,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } else { clear_opt(sb, DATA_FLAGS); + set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3742,8 +3707,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter); - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -4279,7 +4242,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - int err = 0; + int err; #ifdef CONFIG_QUOTA int i; #endif @@ -4405,13 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_MMP)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; - goto restore_opts; - } enable_quota = 1; } } @@ -4476,7 +4432,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; u64 fsid; - s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; @@ -4520,10 +4475,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - /* prevent underflow in case that few free space is available */ - buf->f_bfree = max_t(s64, bfree, 0); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; @@ -4699,9 +4652,6 @@ static int ext4_quota_off(struct super_block *sb, int type) if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - if (!inode) - goto out; - /* Update modification times of quota files when userspace can * start looking at them */ handle = ext4_journal_start(inode, 1); @@ -4822,6 +4772,14 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); @@ -4834,22 +4792,10 @@ static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } - -static inline int ext2_feature_set_ok(struct super_block *sb) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) - return 0; - if (sb->s_flags & MS_RDONLY) - return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) - return 0; - return 1; -} MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } -static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif #if !defined(CONFIG_EXT3_FS) && 
!defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) @@ -4865,24 +4811,10 @@ static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } - -static inline int ext3_feature_set_ok(struct super_block *sb) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) - return 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - return 0; - if (sb->s_flags & MS_RDONLY) - return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) - return 0; - return 1; -} MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } -static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } #endif static struct file_system_type ext4_fs_type = { @@ -4966,8 +4898,8 @@ static int __init ext4_init_fs(void) err = init_inodecache(); if (err) goto out1; - register_as_ext3(); register_as_ext2(); + register_as_ext3(); err = register_filesystem(&ext4_fs_type); if (err) goto out; diff --git a/trunk/fs/ext4/xattr.c b/trunk/fs/ext4/xattr.c index c757adc97250..b545ca1c459c 100644 --- a/trunk/fs/ext4/xattr.c +++ b/trunk/fs/ext4/xattr.c @@ -820,8 +820,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, goal, 0, - NULL, &error); + block = ext4_new_meta_blocks(handle, inode, + goal, NULL, &error); if (error) goto cleanup; diff --git a/trunk/fs/fat/namei_msdos.c b/trunk/fs/fat/namei_msdos.c index be15437c272e..3b222dafd15b 100644 --- a/trunk/fs/fat/namei_msdos.c +++ b/trunk/fs/fat/namei_msdos.c @@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; - dentry_unhash(dentry); - lock_super(sb); /* * Check whether the directory is not in use, then check @@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { err = -EIO; diff --git a/trunk/fs/fat/namei_vfat.c b/trunk/fs/fat/namei_vfat.c index c61a6789f36c..20b4ea53fdc4 100644 --- a/trunk/fs/fat/namei_vfat.c +++ b/trunk/fs/fat/namei_vfat.c @@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; - dentry_unhash(dentry); - lock_super(sb); err = fat_dir_empty(inode); @@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; diff --git a/trunk/fs/fuse/dir.c b/trunk/fs/fuse/dir.c index 0d0e3faddcfa..b32eb29a4e6f 100644 --- a/trunk/fs/fuse/dir.c +++ b/trunk/fs/fuse/dir.c @@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (IS_ERR(req)) return PTR_ERR(req); - dentry_unhash(entry); - req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); req->in.numargs = 1; @@ -693,10 +691,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); struct fuse_req *req = 
fuse_get_req(fc); - - if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode)) - dentry_unhash(newent); - if (IS_ERR(req)) return PTR_ERR(req); diff --git a/trunk/fs/hfs/dir.c b/trunk/fs/hfs/dir.c index 1cb70cdba2c1..b4d70b13be92 100644 --- a/trunk/fs/hfs/dir.c +++ b/trunk/fs/hfs/dir.c @@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; - if (S_ISDIR(inode->i_mode)) - dentry_unhash(dentry); - if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); @@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - res = hfs_remove(new_dir, new_dentry); if (res) return res; diff --git a/trunk/fs/hfsplus/dir.c b/trunk/fs/hfsplus/dir.c index b28835091dd0..4df5059c25da 100644 --- a/trunk/fs/hfsplus/dir.c +++ b/trunk/fs/hfsplus/dir.c @@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; - dentry_unhash(dentry); - if (inode->i_size != 2) return -ENOTEMPTY; @@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) { - dentry_unhash(new_dentry); + if (S_ISDIR(new_dentry->d_inode->i_mode)) res = hfsplus_rmdir(new_dir, new_dentry); - } else { + else res = hfsplus_unlink(new_dir, new_dentry); - } if (res) return res; } diff --git a/trunk/fs/hostfs/hostfs_kern.c b/trunk/fs/hostfs/hostfs_kern.c index e6816b9e6903..2638c834ed28 100644 --- a/trunk/fs/hostfs/hostfs_kern.c +++ b/trunk/fs/hostfs/hostfs_kern.c @@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry) char *file; int err; - dentry_unhash(dentry); - if ((file = dentry_name(dentry)) == NULL) return -ENOMEM; err = do_rmdir(file); @@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, char *from_name, *to_name; int err; - if (to->d_inode && S_ISDIR(to->d_inode->i_mode)) - dentry_unhash(to); - if ((from_name = dentry_name(from)) == NULL) return -ENOMEM; if ((to_name = dentry_name(to)) == NULL) { diff --git a/trunk/fs/hpfs/namei.c b/trunk/fs/hpfs/namei.c index ff0ce21c0867..1f05839c27a7 100644 --- a/trunk/fs/hpfs/namei.c +++ b/trunk/fs/hpfs/namei.c @@ -395,6 +395,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) dentry_unhash(dentry); if (!d_unhashed(dentry)) { + dput(dentry); hpfs_unlock(dir->i_sb); return -ENOSPC; } @@ -402,6 +403,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry); + dput(dentry); } else { struct iattr newattrs; /*printk("HPFS: truncating file before delete.\n");*/ @@ -409,6 +411,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; err = notify_change(dentry, &newattrs); put_write_access(inode); + dput(dentry); if (!err) goto again; } @@ -439,8 +442,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) int err; int r; - dentry_unhash(dentry); - hpfs_adjust_length(name, &len); hpfs_lock(dir->i_sb); err = -ENOENT; @@ -534,10 +535,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh; struct fnode *fnode; 
int err; - - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - if ((err = hpfs_chk_name(new_name, &new_len))) return err; err = 0; hpfs_adjust_length(old_name, &old_len); diff --git a/trunk/fs/hugetlbfs/inode.c b/trunk/fs/hugetlbfs/inode.c index 7aafeb8fa300..e7a035781b7d 100644 --- a/trunk/fs/hugetlbfs/inode.c +++ b/trunk/fs/hugetlbfs/inode.c @@ -921,8 +921,7 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, struct user_struct **user, int creat_flags) { int error = -ENOMEM; diff --git a/trunk/fs/jbd2/commit.c b/trunk/fs/jbd2/commit.c index 7f21cf3aaf92..29148a81c783 100644 --- a/trunk/fs/jbd2/commit.c +++ b/trunk/fs/jbd2/commit.c @@ -219,6 +219,7 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); + commit_transaction->t_flushed_data_blocks = 1; clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); @@ -671,16 +672,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; } - write_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_COMMIT); - commit_transaction->t_state = T_COMMIT_DFLUSH; - write_unlock(&journal->j_state_lock); /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record */ - if (commit_transaction->t_need_data_flush && + if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); @@ -757,13 +754,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* - * Wake up any transactions which were waiting for this IO to - * complete. The barrier must be here so that changes by - * jbd2_journal_file_buffer() take effect before wake_up_bit() - * does the waitqueue check. 
- */ - smp_mb(); + /* Wake up any transactions which were waiting for this + IO to complete */ wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); @@ -802,10 +794,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); - write_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); - commit_transaction->t_state = T_COMMIT_JFLUSH; - write_unlock(&journal->j_state_lock); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -961,7 +949,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); + J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_start = jiffies; stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, diff --git a/trunk/fs/jbd2/journal.c b/trunk/fs/jbd2/journal.c index 9a7826990304..e0ec3db1c395 100644 --- a/trunk/fs/jbd2/journal.c +++ b/trunk/fs/jbd2/journal.c @@ -479,12 +479,9 @@ int __jbd2_log_space_left(journal_t *journal) int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* - * The only transaction we can possibly wait upon is the - * currently running transaction (if it exists). Otherwise, - * the target tid must be an old one. + * Are we already doing a recent enough commit? */ - if (journal->j_running_transaction && - journal->j_running_transaction->t_tid == target) { + if (!tid_geq(journal->j_commit_request, target)) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -496,15 +493,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } else if (!tid_geq(journal->j_commit_request, target)) - /* This should never happen, but if it does, preserve - the evidence before kjournald goes into a loop and - increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", - journal->j_commit_request, - journal->j_commit_sequence, - target, journal->j_running_transaction ? - journal->j_running_transaction->t_tid : 0); + } return 0; } @@ -587,47 +576,6 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) return ret; } -/* - * Return 1 if a given transaction has not yet sent barrier request - * connected with a transaction commit. If 0 is returned, transaction - * may or may not have sent the barrier. Used to avoid sending barrier - * twice in common cases. - */ -int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) -{ - int ret = 0; - transaction_t *commit_trans; - - if (!(journal->j_flags & JBD2_BARRIER)) - return 0; - read_lock(&journal->j_state_lock); - /* Transaction already committed? */ - if (tid_geq(journal->j_commit_sequence, tid)) - goto out; - commit_trans = journal->j_committing_transaction; - if (!commit_trans || commit_trans->t_tid != tid) { - ret = 1; - goto out; - } - /* - * Transaction is being committed and we already proceeded to - * submitting a flush to fs partition? 
- */ - if (journal->j_fs_dev != journal->j_dev) { - if (!commit_trans->t_need_data_flush || - commit_trans->t_state >= T_COMMIT_DFLUSH) - goto out; - } else { - if (commit_trans->t_state >= T_COMMIT_JFLUSH) - goto out; - } - ret = 1; -out: - read_unlock(&journal->j_state_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); - /* * Wait for a specified commit to complete. * The caller may not hold the journal lock. diff --git a/trunk/fs/jbd2/transaction.c b/trunk/fs/jbd2/transaction.c index 3eec82d32fd4..05fa77a23711 100644 --- a/trunk/fs/jbd2/transaction.c +++ b/trunk/fs/jbd2/transaction.c @@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) */ /* - * Update transaction's maximum wait time, if debugging is enabled. + * Update transiaction's maximum wait time, if debugging is enabled. * * In order for t_max_wait to be reliable, it must be protected by a * lock. But doing so will mean that start_this_handle() can not be @@ -91,10 +91,11 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * means that maximum wait time reported by the jbd2_run_stats * tracepoint will always be zero. */ -static inline void update_t_max_wait(transaction_t *transaction, - unsigned long ts) +static inline void update_t_max_wait(transaction_t *transaction) { #ifdef CONFIG_JBD2_DEBUG + unsigned long ts = jiffies; + if (jbd2_journal_enable_debug && time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); @@ -120,7 +121,6 @@ static int start_this_handle(journal_t *journal, handle_t *handle, tid_t tid; int needed, need_to_start; int nblocks = handle->h_buffer_credits; - unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -271,7 +271,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ - update_t_max_wait(transaction, ts); + update_t_max_wait(transaction); handle->h_transaction = transaction; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -316,8 +316,7 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. + * Return a pointer to a newly allocated handle, or NULL on failure */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { @@ -922,8 +921,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); -out: jbd2_journal_put_journal_head(jh); +out: return err; } @@ -2148,13 +2147,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) jinode->i_next_transaction == transaction) goto done; - /* - * We only ever set this variable to 1 so the test is safe. 
Since - * t_need_data_flush is likely to be set, we do the test to save some - * cacheline bouncing - */ - if (!transaction->t_need_data_flush) - transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { diff --git a/trunk/fs/jffs2/dir.c b/trunk/fs/jffs2/dir.c index 05f73328b28b..82faddd1f321 100644 --- a/trunk/fs/jffs2/dir.c +++ b/trunk/fs/jffs2/dir.c @@ -609,8 +609,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) int ret; uint32_t now = get_seconds(); - dentry_unhash(dentry); - for (fd = f->dents ; fd; fd = fd->next) { if (fd->ino) return -ENOTEMPTY; @@ -786,9 +784,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, uint8_t type; uint32_t now; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* The VFS will check for us and prevent trying to rename a * file over a directory and vice versa, but if it's a directory, * the VFS can't check whether the victim is empty. The filesystem diff --git a/trunk/fs/jfs/namei.c b/trunk/fs/jfs/namei.c index 865df16a6cf3..eaaf2b511e89 100644 --- a/trunk/fs/jfs/namei.c +++ b/trunk/fs/jfs/namei.c @@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); - dentry_unhash(dentry); - /* Init inode for quota operations. */ dquot_initialize(dip); dquot_initialize(ip); @@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - dquot_initialize(old_dir); dquot_initialize(new_dir); diff --git a/trunk/fs/logfs/dir.c b/trunk/fs/logfs/dir.c index f34c9cde9e94..9ed89d1663f8 100644 --- a/trunk/fs/logfs/dir.c +++ b/trunk/fs/logfs/dir.c @@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - dentry_unhash(dentry); - if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, loff_t pos; int err; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* 1. locate source dd */ err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); if (err) diff --git a/trunk/fs/minix/namei.c b/trunk/fs/minix/namei.c index f60aed8db9c4..6e6777f1b4b2 100644 --- a/trunk/fs/minix/namei.c +++ b/trunk/fs/minix/namei.c @@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err = -ENOTEMPTY; - dentry_unhash(dentry); - if (minix_empty_dir(inode)) { err = minix_unlink(dir, dentry); if (!err) { @@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct minix_dir_entry * old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/trunk/fs/mpage.c b/trunk/fs/mpage.c index fdfae9fa98cd..0afc809e46e0 100644 --- a/trunk/fs/mpage.c +++ b/trunk/fs/mpage.c @@ -27,7 +27,6 @@ #include #include #include -#include /* * I/O completion handler for multipage BIOs. 
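The do_mpage_readpage() hunk just below strips out the cleancache probe (matching the header include dropped in the mpage.c hunk above). For orientation, this is approximately the fragment being deleted, re-quoted with explanatory comments added; it uses do_mpage_readpage()'s own locals (page, fully_mapped, blocks_per_page) and its confused: label, so treat it as annotation rather than standalone code:

	/*
	 * Before adding the page to a read BIO, ask cleancache
	 * (transcendent memory) for a copy.  Only a fully mapped page
	 * backed by a single block is eligible, because a hit has to
	 * satisfy the entire page at once.
	 */
	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
	    cleancache_get_page(page) == 0) {
		/* Hit: the page has been filled from the cache copy. */
		SetPageUptodate(page);
		/*
		 * The confused: slow path sees PageUptodate and simply
		 * unlocks the page, so no disk I/O is issued.
		 */
		goto confused;
	}

cleancache_get_page() returns 0 only when the page was successfully filled from the cache, so setting the uptodate flag on that branch is safe; with the hook removed, every such read goes back to the block layer.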
@@ -272,12 +271,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } - /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c index 2358b326b221..6ff858c049c0 100644 --- a/trunk/fs/namei.c +++ b/trunk/fs/namei.c @@ -391,28 +391,79 @@ void path_put(struct path *path) } EXPORT_SYMBOL(path_put); -/* +/** + * nameidata_drop_rcu - drop this nameidata out of rcu-walk + * @nd: nameidata pathwalk data to drop + * Returns: 0 on success, -ECHILD on failure + * * Path walking has 2 modes, rcu-walk and ref-walk (see - * Documentation/filesystems/path-lookup.txt). In situations when we can't - * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab - * normal reference counts on dentries and vfsmounts to transition to rcu-walk - * mode. Refcounts are grabbed at the last known good point before rcu-walk - * got stuck, so ref-walk may continue from there. If this is not successful - * (eg. a seqcount has changed), then failure is returned and it's up to caller - * to restart the path walk from the beginning in ref-walk mode. + * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt + * to drop out of rcu-walk mode and take normal reference counts on dentries + * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take + * refcounts at the last known good point before rcu-walk got stuck, so + * ref-walk may continue from there. If this is not successful (eg. a seqcount + * has changed), then failure is returned and path walk restarts from the + * beginning in ref-walk mode. + * + * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into + * ref-walk. Must be called from rcu-walk context. */ +static int nameidata_drop_rcu(struct nameidata *nd) +{ + struct fs_struct *fs = current->fs; + struct dentry *dentry = nd->path.dentry; + int want_root = 0; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + if (want_root) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); +err_root: + if (want_root) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_drop_rcu(nd); + return 0; +} /** - * unlazy_walk - try to switch to ref-walk mode. - * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry or NULL + * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @dentry: dentry to drop * Returns: 0 on success, -ECHILD on failure * - * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd or NULL. 
Must be called from rcu-walk context. + * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, + * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. */ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; @@ -427,25 +478,18 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) goto err_root; } spin_lock(&parent->d_lock); - if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; - BUG_ON(nd->inode != parent->d_inode); - } else { - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); - } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + /* + * If the sequence check on the child dentry passed, then the child has + * not been removed from its parent. This means the parent dentry must + * be valid and able to take a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); if (want_root) { path_get(&nd->root); @@ -457,10 +501,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) br_read_unlock(vfsmount_lock); nd->flags &= ~LOOKUP_RCU; return 0; - -err_child: +err: spin_unlock(&dentry->d_lock); -err_parent: spin_unlock(&parent->d_lock); err_root: if (want_root) @@ -468,6 +510,59 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) return -ECHILD; } +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) +{ + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } + return 0; +} + +/** + * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk + * @nd: nameidata pathwalk data to drop + * Returns: 0 on success, -ECHILD on failure + * + * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. + * nd->path should be the final element of the lookup, so nd->root is discarded. + * Must be called from rcu-walk context. 
+ */ +static int nameidata_drop_rcu_last(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_unlock; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + + return 0; + +err_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -511,39 +606,26 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -/** - * complete_walk - successful completion of path walk - * @nd: pointer nameidata +/* + * handle_reval_path - force revalidation of a dentry + * + * In some situations the path walking code will trust dentries without + * revalidating them. This causes problems for filesystems that depend on + * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set + * (which indicates that it's possible for the dentry to go stale), force + * a d_revalidate call before proceeding. * - * If we had been in RCU mode, drop out of it and legitimize nd->path. - * Revalidate the final result, unless we'd already done that during - * the path walk or the filesystem doesn't ask for it. Return 0 on - * success, -error on failure. In case of failure caller does not - * need to drop nd->path. + * Returns 0 if the revalidation was successful. If the revalidation fails, + * either return the error returned by d_revalidate or -ESTALE if the + * revalidation just returned 0. If d_revalidate returns 0, we attempt to + * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary.
*/ -static int complete_walk(struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; - if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - return -ECHILD; - } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - mntget(nd->path.mnt); - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; @@ -561,7 +643,6 @@ static int complete_walk(struct nameidata *nd) if (!status) status = -ESTALE; - path_put(&nd->path); return status; } @@ -1160,8 +1241,13 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; unlazy: - if (unlazy_walk(nd, dentry)) - return -ECHILD; + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } } else { dentry = __d_lookup(parent, name); } @@ -1217,7 +1303,7 @@ static inline int may_lookup(struct nameidata *nd) int err = exec_permission(nd->inode, IPERM_FLAG_RCU); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL)) + if (nameidata_drop_rcu(nd)) return -ECHILD; } return exec_permission(nd->inode, 0); @@ -1271,12 +1357,8 @@ static inline int walk_component(struct nameidata *nd, struct path *path, return -ENOENT; } if (unlikely(inode->i_op->follow_link) && follow) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - terminate_walk(nd); - return -ECHILD; - } - } + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; BUG_ON(inode != path->dentry->d_inode); return 1; } @@ -1575,8 +1657,18 @@ static int path_lookupat(int dfd, const char *name, } } - if (!err) - err = complete_walk(nd); + if (nd->flags & LOOKUP_RCU) { + /* went all way through without dropping RCU */ + BUG_ON(err); + if (nameidata_drop_rcu_last(nd)) + err = -ECHILD; + } + + if (!err) { + err = handle_reval_path(nd); + if (err) + path_put(&nd->path); + } if (!err && nd->flags & LOOKUP_DIRECTORY) { if (!nd->inode->i_op->lookup) { @@ -2042,9 +2134,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path, return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - error = complete_walk(nd); + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + error = handle_reval_path(nd); if (error) - return ERR_PTR(error); + goto exit; audit_inode(pathname, nd->path.dentry); if (open_flag & O_CREAT) { error = -EISDIR; @@ -2052,9 +2148,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } goto ok; case LAST_BIND: - error = complete_walk(nd); + /* can't be RCU mode here */ + error = handle_reval_path(nd); if (error) - return ERR_PTR(error); + goto exit; audit_inode(pathname, dir); goto ok; } @@ -2073,9 +2170,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) /* symlink */ return NULL; /* sayonara */ - error = complete_walk(nd); - if (error) - return ERR_PTR(-ECHILD); + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } error = -ENOTDIR; if (nd->flags & LOOKUP_DIRECTORY) { @@ -2087,9 +2185,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* 
create side of things */ - error = complete_walk(nd); - if (error) - return ERR_PTR(error); + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } audit_inode(pathname, dir); error = -EISDIR; @@ -2529,10 +2629,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) } /* - * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 2 if we're the only user of this - * dentry, and if that is true (possibly after pruning the dcache), - * then we drop the dentry now. + * We try to drop the dentry early: we should have + * a usage count of 2 if we're the only user of this + * dentry, and if that is true (possibly after pruning + * the dcache), then we drop the dentry now. * * A low-level filesystem can, if it choses, legally * do a @@ -2545,9 +2645,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) */ void dentry_unhash(struct dentry *dentry) { + dget(dentry); shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_count == 2) __d_drop(dentry); spin_unlock(&dentry->d_lock); } @@ -2563,26 +2664,25 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; mutex_lock(&dentry->d_inode->i_mutex); - - error = -EBUSY; + dentry_unhash(dentry); if (d_mountpoint(dentry)) - goto out; - - error = security_inode_rmdir(dir, dentry); - if (error) - goto out; - - error = dir->i_op->rmdir(dir, dentry); - if (error) - goto out; - - dentry->d_inode->i_flags |= S_DEAD; - dont_mount(dentry); - -out: + error = -EBUSY; + else { + error = security_inode_rmdir(dir, dentry); + if (!error) { + error = dir->i_op->rmdir(dir, dentry); + if (!error) { + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + } + } + } mutex_unlock(&dentry->d_inode->i_mutex); - if (!error) + if (!error) { d_delete(dentry); + } + dput(dentry); + return error; } @@ -2953,7 +3053,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. - * d) conversion from fhandle to dentry may come in the wrong moment - when + * d) some filesystems don't support opened-but-unlinked directories, + * either because of layout or because they are not ready to deal with + * all cases correctly. The latter will be fixed (taking this sort of + * stuff into VFS), but the former is not going away. Solution: the same + * trick as in rmdir(). + * e) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. 
[FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive @@ -2963,7 +3068,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { int error = 0; - struct inode *target = new_dentry->d_inode; + struct inode *target; /* * If we are going to change the parent - check write permissions, @@ -2979,24 +3084,26 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; + target = new_dentry->d_inode; if (target) mutex_lock(&target->i_mutex); - - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) - goto out; - - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + error = -EBUSY; + else { + if (target) + dentry_unhash(new_dentry); + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); } -out: - if (target) + if (target) { + if (!error) { + target->i_flags |= S_DEAD; + dont_mount(new_dentry); + } mutex_unlock(&target->i_mutex); + if (d_unhashed(new_dentry)) + d_rehash(new_dentry); + dput(new_dentry); + } if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); @@ -3006,7 +3113,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *target = new_dentry->d_inode; + struct inode *target; int error; error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); @@ -3014,22 +3121,19 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, return error; dget(new_dentry); + target = new_dentry->d_inode; if (target) mutex_lock(&target->i_mutex); - - error = -EBUSY; if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) - goto out; - - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) - dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); -out: + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (!error) { + if (target) + dont_mount(new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); + } if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); diff --git a/trunk/fs/namespace.c b/trunk/fs/namespace.c index fe59bd145d21..d99bcf59e4c2 100644 --- a/trunk/fs/namespace.c +++ b/trunk/fs/namespace.c @@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) static int flags_to_propagation_type(int flags) { - int type = flags & ~(MS_REC | MS_SILENT); + int type = flags & ~MS_REC; /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) diff --git a/trunk/fs/ncpfs/dir.c b/trunk/fs/ncpfs/dir.c index e3e646b06404..f6946bb5cb55 100644 --- a/trunk/fs/ncpfs/dir.c +++ b/trunk/fs/ncpfs/dir.c @@ -1033,8 +1033,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); - dentry_unhash(dentry); - error = -EBUSY; if (!d_unhashed(dentry)) goto out; @@ -1141,9 
+1139,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); diff --git a/trunk/fs/nilfs2/namei.c b/trunk/fs/nilfs2/namei.c index 1102a5fbb744..546849b3e88f 100644 --- a/trunk/fs/nilfs2/namei.c +++ b/trunk/fs/nilfs2/namei.c @@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) struct nilfs_transaction_info ti; int err; - dentry_unhash(dentry); - err = nilfs_transaction_begin(dir->i_sb, &ti, 0); if (err) return err; @@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct nilfs_transaction_info ti; int err; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); if (unlikely(err)) return err; diff --git a/trunk/fs/ocfs2/Makefile b/trunk/fs/ocfs2/Makefile index f17e58b32989..d8a0313e99e6 100644 --- a/trunk/fs/ocfs2/Makefile +++ b/trunk/fs/ocfs2/Makefile @@ -30,7 +30,6 @@ ocfs2-objs := \ namei.o \ refcounttree.o \ reservations.o \ - move_extents.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/trunk/fs/ocfs2/alloc.c b/trunk/fs/ocfs2/alloc.c index ed553c60de82..48aa9c7401c7 100644 --- a/trunk/fs/ocfs2/alloc.c +++ b/trunk/fs/ocfs2/alloc.c @@ -29,7 +29,6 @@ #include #include #include -#include #include @@ -7185,168 +7184,3 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, out: return ret; } - -static int ocfs2_trim_extent(struct super_block *sb, - struct ocfs2_group_desc *gd, - u32 start, u32 count) -{ - u64 discard, bcount; - - bcount = ocfs2_clusters_to_blocks(sb, count); - discard = le64_to_cpu(gd->bg_blkno) + - ocfs2_clusters_to_blocks(sb, start); - - trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); - - return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); -} - -static int ocfs2_trim_group(struct super_block *sb, - struct ocfs2_group_desc *gd, - u32 start, u32 max, u32 minbits) -{ - int ret = 0, count = 0, next; - void *bitmap = gd->bg_bitmap; - - if (le16_to_cpu(gd->bg_free_bits_count) < minbits) - return 0; - - trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), - start, max, minbits); - - while (start < max) { - start = ocfs2_find_next_zero_bit(bitmap, max, start); - if (start >= max) - break; - next = ocfs2_find_next_bit(bitmap, max, start); - - if ((next - start) >= minbits) { - ret = ocfs2_trim_extent(sb, gd, - start, next - start); - if (ret < 0) { - mlog_errno(ret); - break; - } - count += next - start; - } - start = next + 1; - - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } - - if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) - break; - } - - if (ret < 0) - count = ret; - - return count; -} - -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; - int ret, cnt; - u32 first_bit, last_bit, minlen; - struct buffer_head *main_bm_bh = NULL; - struct inode *main_bm_inode = NULL; - struct buffer_head *gd_bh = NULL; - struct ocfs2_dinode *main_bm; - struct ocfs2_group_desc *gd = NULL; - - start = range->start >> osb->s_clustersize_bits; - len = range->len >> osb->s_clustersize_bits; - minlen = 
range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - - if (!len) { - range->len = 0; - return 0; - } - - if (minlen >= osb->bitmap_cpg) - return -EINVAL; - - main_bm_inode = ocfs2_get_system_file_inode(osb, - GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT); - if (!main_bm_inode) { - ret = -EIO; - mlog_errno(ret); - goto out; - } - - mutex_lock(&main_bm_inode->i_mutex); - - ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); - if (ret < 0) { - mlog_errno(ret); - goto out_mutex; - } - main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = -EINVAL; - goto out_unlock; - } - - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; - - trace_ocfs2_trim_fs(start, len, minlen); - - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; - - for (group = first_group; group <= last_group;) { - if (first_bit + len >= osb->bitmap_cpg) - last_bit = osb->bitmap_cpg; - else - last_bit = first_bit + len; - - ret = ocfs2_read_group_descriptor(main_bm_inode, - main_bm, group, - &gd_bh); - if (ret < 0) { - mlog_errno(ret); - break; - } - - gd = (struct ocfs2_group_desc *)gd_bh->b_data; - cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); - brelse(gd_bh); - gd_bh = NULL; - if (cnt < 0) { - ret = cnt; - mlog_errno(ret); - break; - } - - trimmed += cnt; - len -= osb->bitmap_cpg - first_bit; - first_bit = 0; - if (group == osb->first_cluster_group_blkno) - group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - else - group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; -out_unlock: - ocfs2_inode_unlock(main_bm_inode, 0); - brelse(main_bm_bh); -out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); - iput(main_bm_inode); -out: - return ret; -} diff --git a/trunk/fs/ocfs2/alloc.h b/trunk/fs/ocfs2/alloc.h index ca381c584127..3bd08a03251c 100644 --- a/trunk/fs/ocfs2/alloc.h +++ b/trunk/fs/ocfs2/alloc.h @@ -239,7 +239,6 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci, struct buffer_head **leaf_bh); int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); /* * Helper function to look at the # of clusters in an extent record. */ diff --git a/trunk/fs/ocfs2/cluster/sys.c b/trunk/fs/ocfs2/cluster/sys.c index a4b07730b2e1..bc702dab5d1f 100644 --- a/trunk/fs/ocfs2/cluster/sys.c +++ b/trunk/fs/ocfs2/cluster/sys.c @@ -57,6 +57,7 @@ static struct kset *o2cb_kset; void o2cb_sys_shutdown(void) { mlog_sys_shutdown(); + sysfs_remove_link(NULL, "o2cb"); kset_unregister(o2cb_kset); } @@ -68,6 +69,14 @@ int o2cb_sys_init(void) if (!o2cb_kset) return -ENOMEM; + /* + * Create this symlink for backwards compatibility with old + * versions of ocfs2-tools which look for things in /sys/o2cb. 
+ */ + ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb"); + if (ret) + goto error; + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); if (ret) goto error; diff --git a/trunk/fs/ocfs2/dlm/dlmcommon.h b/trunk/fs/ocfs2/dlm/dlmcommon.h index d602abb51b61..4bdf7baee344 100644 --- a/trunk/fs/ocfs2/dlm/dlmcommon.h +++ b/trunk/fs/ocfs2/dlm/dlmcommon.h @@ -144,7 +144,6 @@ struct dlm_ctxt wait_queue_head_t dlm_join_events; unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct dlm_recovery_ctxt reco; spinlock_t master_lock; @@ -402,18 +401,6 @@ static inline int dlm_lvb_is_empty(char *lvb) return 1; } -static inline char *dlm_list_in_text(enum dlm_lockres_list idx) -{ - if (idx == DLM_GRANTED_LIST) - return "granted"; - else if (idx == DLM_CONVERTING_LIST) - return "converting"; - else if (idx == DLM_BLOCKED_LIST) - return "blocked"; - else - return "unknown"; -} - static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -461,7 +448,6 @@ enum { DLM_FINALIZE_RECO_MSG = 518, DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, - DLM_BEGIN_EXIT_DOMAIN_MSG = 521, }; struct dlm_reco_node_data diff --git a/trunk/fs/ocfs2/dlm/dlmdebug.c b/trunk/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e3..04a32be0aeb9 100644 --- a/trunk/fs/ocfs2/dlm/dlmdebug.c +++ b/trunk/fs/ocfs2/dlm/dlmdebug.c @@ -756,12 +756,6 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) buf + out, len - out); out += snprintf(buf + out, len - out, "\n"); - /* Exit Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Exit Domain Map: "); - out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, - buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); - /* Live Map: xx xx xx */ out += snprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, diff --git a/trunk/fs/ocfs2/dlm/dlmdomain.c b/trunk/fs/ocfs2/dlm/dlmdomain.c index 6ed6b95dcf93..3b179d6cbde0 100644 --- a/trunk/fs/ocfs2/dlm/dlmdomain.c +++ b/trunk/fs/ocfs2/dlm/dlmdomain.c @@ -132,12 +132,10 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * New in version 1.1: * - Message DLM_QUERY_REGION added to support global heartbeat * - Message DLM_QUERY_NODEINFO added to allow online node removes - * New in version 1.2: - * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 1, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -451,18 +449,14 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) dropped = dlm_empty_lockres(dlm, res); spin_lock(&res->spinlock); - if (dropped) - __dlm_lockres_calc_usage(dlm, res); - else - iter = res->hash_node.next; + __dlm_lockres_calc_usage(dlm, res); + iter = res->hash_node.next; spin_unlock(&res->spinlock); dlm_lockres_put(res); - if (dropped) { - cond_resched_lock(&dlm->spinlock); + if (dropped) goto redo_bucket; - } } cond_resched_lock(&dlm->spinlock); num += n; @@ -492,28 +486,6 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm) return ret; } -static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len, - void *data, void **ret_data) -{ - struct dlm_ctxt *dlm = data; - unsigned int node; - struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) 
msg->buf; - - if (!dlm_grab(dlm)) - return 0; - - node = exit_msg->node_idx; - mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node); - - spin_lock(&dlm->spinlock); - set_bit(node, dlm->exit_domain_map); - spin_unlock(&dlm->spinlock); - - dlm_put(dlm); - - return 0; -} - static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) { /* Yikes, a double spinlock! I need domain_lock for the dlm @@ -570,7 +542,6 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); - clear_bit(node, dlm->exit_domain_map); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -583,56 +554,29 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, return 0; } -static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type, +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, unsigned int node) { int status; struct dlm_exit_domain leave_msg; - mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name, - msg_type, node); + mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", + node, dlm->name, dlm->node_num); memset(&leave_msg, 0, sizeof(leave_msg)); leave_msg.node_idx = dlm->node_num; - status = o2net_send_message(msg_type, dlm->key, &leave_msg, - sizeof(leave_msg), node, NULL); + status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, + &leave_msg, sizeof(leave_msg), node, + NULL); if (status < 0) - mlog(ML_ERROR, "Error %d sending domain exit message %u " - "to node %u on domain %s\n", status, msg_type, node, - dlm->name); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); + mlog(0, "status return %d from o2net_send_message\n", status); return status; } -static void dlm_begin_exit_domain(struct dlm_ctxt *dlm) -{ - int node = -1; - - /* Support for begin exit domain was added in 1.2 */ - if (dlm->dlm_locking_proto.pv_major == 1 && - dlm->dlm_locking_proto.pv_minor < 2) - return; - - /* - * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely - * informational. Meaning if a node does not receive the message, - * so be it. - */ - spin_lock(&dlm->spinlock); - while (1) { - node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1); - if (node >= O2NM_MAX_NODES) - break; - if (node == dlm->node_num) - continue; - - spin_unlock(&dlm->spinlock); - dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node); - spin_lock(&dlm->spinlock); - } - spin_unlock(&dlm->spinlock); -} static void dlm_leave_domain(struct dlm_ctxt *dlm) { @@ -658,8 +602,7 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) clear_node = 1; - status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG, - node); + status = dlm_send_one_domain_exit(dlm, node); if (status < 0 && status != -ENOPROTOOPT && status != -ENOTCONN) { @@ -734,7 +677,6 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) if (leave) { mlog(0, "shutting down domain %s\n", dlm->name); - dlm_begin_exit_domain(dlm); /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); @@ -967,7 +909,6 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * leftover join state. 
*/ BUG_ON(dlm->joining_node != assert->node_idx); set_bit(assert->node_idx, dlm->domain_map); - clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", @@ -1852,13 +1793,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) if (status) goto bail; - status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key, - sizeof(struct dlm_exit_domain), - dlm_begin_exit_domain_handler, - dlm, NULL, &dlm->dlm_domain_handlers); - if (status) - goto bail; - bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/trunk/fs/ocfs2/dlm/dlmmaster.c b/trunk/fs/ocfs2/dlm/dlmmaster.c index 11eefb8c12e9..84d166328cf7 100644 --- a/trunk/fs/ocfs2/dlm/dlmmaster.c +++ b/trunk/fs/ocfs2/dlm/dlmmaster.c @@ -2339,55 +2339,65 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) dlm_lockres_put(res); } -/* - * A migrateable resource is one that is : - * 1. locally mastered, and, - * 2. zero local locks, and, - * 3. one or more non-local locks, or, one or more references - * Returns 1 if yes, 0 if not. +/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 + * if not. If 0, numlocks is set to the number of locks in the lockres. */ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) + struct dlm_lock_resource *res, + int *numlocks, + int *hasrefs) { - enum dlm_lockres_list idx; - int nonlocal = 0, node_ref; + int ret; + int i; + int count = 0; struct list_head *queue; struct dlm_lock *lock; - u64 cookie; assert_spin_locked(&res->spinlock); - if (res->owner != dlm->node_num) - return 0; + *numlocks = 0; + *hasrefs = 0; - for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { - queue = dlm_list_idx_to_ptr(res, idx); + ret = -EINVAL; + if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "cannot migrate lockres with unknown owner!\n"); + goto leave; + } + + if (res->owner != dlm->node_num) { + mlog(0, "cannot migrate lockres this node doesn't own!\n"); + goto leave; + } + + ret = 0; + queue = &res->granted; + for (i = 0; i < 3; i++) { list_for_each_entry(lock, queue, list) { - if (lock->ml.node != dlm->node_num) { - nonlocal++; - continue; + ++count; + if (lock->ml.node == dlm->node_num) { + mlog(0, "found a lock owned by this node still " + "on the %s queue! will not migrate this " + "lockres\n", (i == 0 ? "granted" : + (i == 1 ? 
"converting" : + "blocked"))); + ret = -ENOTEMPTY; + goto leave; } - cookie = be64_to_cpu(lock->ml.cookie); - mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " - "%s list\n", dlm->name, res->lockname.len, - res->lockname.name, - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - dlm_list_in_text(idx)); - return 0; } + queue++; } - if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (node_ref >= O2NM_MAX_NODES) - return 0; - } + *numlocks = count; - mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, - res->lockname.name); + count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (count < O2NM_MAX_NODES) + *hasrefs = 1; - return 1; + mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, + res->lockname.len, res->lockname.name, *numlocks, *hasrefs); + +leave: + return ret; } /* @@ -2396,7 +2406,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, static int dlm_migrate_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 target) + struct dlm_lock_resource *res, + u8 target) { struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; @@ -2405,20 +2416,37 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, const char *name; unsigned int namelen; int mle_added = 0; + int numlocks, hasrefs; int wake = 0; if (!dlm_grab(dlm)) return -EINVAL; - BUG_ON(target == O2NM_MAX_NODES); - name = res->lockname.name; namelen = res->lockname.len; - mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, - target); + mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); + + /* + * ensure this lockres is a proper candidate for migration + */ + spin_lock(&res->spinlock); + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret < 0) { + spin_unlock(&res->spinlock); + goto leave; + } + spin_unlock(&res->spinlock); + + /* no work to do */ + if (numlocks == 0 && !hasrefs) + goto leave; + + /* + * preallocate up front + * if this fails, abort + */ - /* preallocate up front. 
if this fails, abort */ ret = -ENOMEM; mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { @@ -2433,11 +2461,36 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } ret = 0; + /* + * find a node to migrate the lockres to + */ + + spin_lock(&dlm->spinlock); + /* pick a new node */ + if (!test_bit(target, dlm->domain_map) || + target >= O2NM_MAX_NODES) { + target = dlm_pick_migration_target(dlm, res); + } + mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name, + namelen, name, target); + + if (target >= O2NM_MAX_NODES || + !test_bit(target, dlm->domain_map)) { + /* target chosen is not alive */ + ret = -EINVAL; + } + + if (ret) { + spin_unlock(&dlm->spinlock); + goto fail; + } + + mlog(0, "continuing with target = %u\n", target); + /* * clear any existing master requests and * add the migration mle to the list */ - spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); @@ -2478,7 +2531,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, dlm_put_mle(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); - mle = NULL; } goto leave; } @@ -2600,52 +2652,69 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, if (wake) wake_up(&res->wq); + /* TODO: cleanup */ if (mres) free_page((unsigned long)mres); dlm_put(dlm); - mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, - name, target, ret); + mlog(0, "returning %d\n", ret); return ret; } #define DLM_MIGRATION_RETRY_MS 100 -/* - * Should be called only after beginning the domain leave process. +/* Should be called only after beginning the domain leave process. * There should not be any remaining locks on nonlocal lock resources, * and there should be no local locks left on locally mastered resources. * * Called with the dlm spinlock held, may drop it to do migration, but * will re-acquire before exit. * - * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped - */ + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; - u8 target = O2NM_MAX_NODES; - - assert_spin_locked(&dlm->spinlock); + int numlocks, hasrefs; spin_lock(&res->spinlock); - if (dlm_is_lockres_migrateable(dlm, res)) - target = dlm_pick_migration_target(dlm, res); - spin_unlock(&res->spinlock); + if (res->owner != dlm->node_num) { + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s:%.*s: this node is not master, " + "trying to free this but locks remain\n", + dlm->name, res->lockname.len, res->lockname.name); + } + spin_unlock(&res->spinlock); + goto leave; + } - if (target == O2NM_MAX_NODES) + /* No need to migrate a lockres having no locks */ + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret >= 0 && numlocks == 0 && !hasrefs) { + spin_unlock(&res->spinlock); goto leave; + } + spin_unlock(&res->spinlock); /* Wheee! Migrate lockres here! Will sleep so drop spinlock. 
*/ spin_unlock(&dlm->spinlock); lock_dropped = 1; - ret = dlm_migrate_lockres(dlm, res, target); - if (ret) - mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", - dlm->name, res->lockname.len, res->lockname.name, - target, ret); + while (1) { + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); + if (ret >= 0) + break; + if (ret == -ENOTEMPTY) { + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", + res->lockname.len, res->lockname.name); + BUG(); + } + + mlog(0, "lockres %.*s: migrate failed, " + "retrying\n", res->lockname.len, + res->lockname.name); + msleep(DLM_MIGRATION_RETRY_MS); + } spin_lock(&dlm->spinlock); leave: return lock_dropped; @@ -2829,55 +2898,61 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, } } -/* - * Pick a node to migrate the lock resource to. This function selects a - * potential target based first on the locks and then on refmap. It skips - * nodes that are in the process of exiting the domain. - */ +/* for now this is not too intelligent. we will + * need stats to make this do the right thing. + * this just finds the first lock on one of the + * queues and uses that node as the target. */ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - enum dlm_lockres_list idx; + int i; struct list_head *queue = &res->granted; struct dlm_lock *lock; - int noderef; - u8 nodenum = O2NM_MAX_NODES; + int nodenum; assert_spin_locked(&dlm->spinlock); - assert_spin_locked(&res->spinlock); - /* Go through all the locks */ - for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { - queue = dlm_list_idx_to_ptr(res, idx); + spin_lock(&res->spinlock); + for (i=0; i<3; i++) { list_for_each_entry(lock, queue, list) { - if (lock->ml.node == dlm->node_num) - continue; - if (test_bit(lock->ml.node, dlm->exit_domain_map)) - continue; - nodenum = lock->ml.node; - goto bail; + /* up to the caller to make sure this node + * is alive */ + if (lock->ml.node != dlm->node_num) { + spin_unlock(&res->spinlock); + return lock->ml.node; + } } + queue++; + } + + nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (nodenum < O2NM_MAX_NODES) { + spin_unlock(&res->spinlock); + return nodenum; } + spin_unlock(&res->spinlock); + mlog(0, "have not found a suitable target yet! checking domain map\n"); - /* Go thru the refmap */ - noderef = -1; + /* ok now we're getting desperate. pick anyone alive. */ + nodenum = -1; while (1) { - noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, - noderef + 1); - if (noderef >= O2NM_MAX_NODES) + nodenum = find_next_bit(dlm->domain_map, + O2NM_MAX_NODES, nodenum+1); + mlog(0, "found %d in domain map\n", nodenum); + if (nodenum >= O2NM_MAX_NODES) break; - if (noderef == dlm->node_num) - continue; - if (test_bit(noderef, dlm->exit_domain_map)) - continue; - nodenum = noderef; - goto bail; + if (nodenum != dlm->node_num) { + mlog(0, "picking %d\n", nodenum); + return nodenum; + } } -bail: - return nodenum; + mlog(0, "giving up. 
no master to migrate to\n"); + return DLM_LOCK_RES_OWNER_UNKNOWN; } + + /* this is called by the new master once all lockres * data has been received */ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, diff --git a/trunk/fs/ocfs2/dlm/dlmrecovery.c b/trunk/fs/ocfs2/dlm/dlmrecovery.c index 7efab6d28a21..f1beb6fc254d 100644 --- a/trunk/fs/ocfs2/dlm/dlmrecovery.c +++ b/trunk/fs/ocfs2/dlm/dlmrecovery.c @@ -2393,7 +2393,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) mlog(0, "node %u being removed from domain map!\n", idx); clear_bit(idx, dlm->domain_map); - clear_bit(idx, dlm->exit_domain_map); /* wake up migration waiters if a node goes down. * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); diff --git a/trunk/fs/ocfs2/dlmfs/dlmfs.c b/trunk/fs/ocfs2/dlmfs/dlmfs.c index b42076797049..8c5c0eddc365 100644 --- a/trunk/fs/ocfs2/dlmfs/dlmfs.c +++ b/trunk/fs/ocfs2/dlmfs/dlmfs.c @@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker; * signifies a bast fired on the lock. */ #define DLMFS_CAPABILITIES "bast stackglue" -static int param_set_dlmfs_capabilities(const char *val, +extern int param_set_dlmfs_capabilities(const char *val, struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); diff --git a/trunk/fs/ocfs2/file.c b/trunk/fs/ocfs2/file.c index b1e35a392ca5..89659d6dc206 100644 --- a/trunk/fs/ocfs2/file.c +++ b/trunk/fs/ocfs2/file.c @@ -2670,7 +2670,6 @@ const struct file_operations ocfs2_fops_no_plocks = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, - .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/trunk/fs/ocfs2/ioctl.c b/trunk/fs/ocfs2/ioctl.c index bc91072b7219..8f13c5989eae 100644 --- a/trunk/fs/ocfs2/ioctl.c +++ b/trunk/fs/ocfs2/ioctl.c @@ -22,11 +22,6 @@ #include "ioctl.h" #include "resize.h" #include "refcounttree.h" -#include "sysfile.h" -#include "dir.h" -#include "buffer_head_io.h" -#include "suballoc.h" -#include "move_extents.h" #include @@ -40,27 +35,31 @@ * be -EFAULT. The error will be returned from the ioctl(2) call. It's * just a best-effort to tell userspace that this request caused the error. 
*/ -static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, +static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } -static inline void o2info_set_request_filled(struct ocfs2_info_request *req) +#define o2info_set_request_error(a, b) \ + __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) + +static inline void __o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } -static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) +#define o2info_set_request_filled(a) \ + __o2info_set_request_filled((struct ocfs2_info_request *)&(a)) + +static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } -static inline int o2info_coherent(struct ocfs2_info_request *req) -{ - return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); -} +#define o2info_clear_request_filled(a) \ + __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { @@ -154,7 +153,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, oib.ib_blocksize = inode->i_sb->s_blocksize; - o2info_set_request_filled(&oib.ib_req); + o2info_set_request_filled(oib); if (o2info_to_user(oib, req)) goto bail; @@ -162,7 +161,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oib.ib_req, req); + o2info_set_request_error(oib, req); return status; } @@ -179,7 +178,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, oic.ic_clustersize = osb->s_clustersize; - o2info_set_request_filled(&oic.ic_req); + o2info_set_request_filled(oic); if (o2info_to_user(oic, req)) goto bail; @@ -187,7 +186,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oic.ic_req, req); + o2info_set_request_error(oic, req); return status; } @@ -204,7 +203,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, oim.im_max_slots = osb->max_slots; - o2info_set_request_filled(&oim.im_req); + o2info_set_request_filled(oim); if (o2info_to_user(oim, req)) goto bail; @@ -212,7 +211,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oim.im_req, req); + o2info_set_request_error(oim, req); return status; } @@ -229,7 +228,7 @@ int ocfs2_info_handle_label(struct inode *inode, memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); - o2info_set_request_filled(&oil.il_req); + o2info_set_request_filled(oil); if (o2info_to_user(oil, req)) goto bail; @@ -237,7 +236,7 @@ int ocfs2_info_handle_label(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oil.il_req, req); + o2info_set_request_error(oil, req); return status; } @@ -254,7 +253,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); - o2info_set_request_filled(&oiu.iu_req); + o2info_set_request_filled(oiu); if (o2info_to_user(oiu, req)) goto bail; @@ -262,7 +261,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oiu.iu_req, req); + o2info_set_request_error(oiu, req); return status; } @@ -281,7 +280,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, oif.if_incompat_features = 
osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; - o2info_set_request_filled(&oif.if_req); + o2info_set_request_filled(oif); if (o2info_to_user(oif, req)) goto bail; @@ -289,7 +288,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oif.if_req, req); + o2info_set_request_error(oif, req); return status; } @@ -306,7 +305,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, oij.ij_journal_size = osb->journal->j_inode->i_size; - o2info_set_request_filled(&oij.ij_req); + o2info_set_request_filled(oij); if (o2info_to_user(oij, req)) goto bail; @@ -314,408 +313,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; -} - -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, - struct inode *inode_alloc, u64 blkno, - struct ocfs2_info_freeinode *fi, u32 slot) -{ - int status = 0, unlock = 0; - - struct buffer_head *bh = NULL; - struct ocfs2_dinode *dinode_alloc = NULL; - - if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); - - if (o2info_coherent(&fi->ifi_req)) { - status = ocfs2_inode_lock(inode_alloc, &bh, 0); - if (status < 0) { - mlog_errno(status); - goto bail; - } - unlock = 1; - } else { - status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - dinode_alloc = (struct ocfs2_dinode *)bh->b_data; - - fi->ifi_stat[slot].lfi_total = - le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); - fi->ifi_stat[slot].lfi_free = - le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - - le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); - -bail: - if (unlock) - ocfs2_inode_unlock(inode_alloc, 0); - - if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); - - brelse(bh); - - return status; -} - -int ocfs2_info_handle_freeinode(struct inode *inode, - struct ocfs2_info_request __user *req) -{ - u32 i; - u64 blkno = -1; - char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; - struct ocfs2_info_freeinode *oifi = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *inode_alloc = NULL; - - oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); - if (!oifi) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - if (o2info_from_user(*oifi, req)) - goto bail; - - oifi->ifi_slotnum = osb->max_slots; - - for (i = 0; i < oifi->ifi_slotnum; i++) { - if (o2info_coherent(&oifi->ifi_req)) { - inode_alloc = ocfs2_get_system_file_inode(osb, type, i); - if (!inode_alloc) { - mlog(ML_ERROR, "unable to get alloc inode in " - "slot %u\n", i); - status = -EIO; - goto bail; - } - } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); - if (status < 0) { - status = -ENOENT; - goto bail; - } - } - - status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; - - iput(inode_alloc); - inode_alloc = NULL; - } - - o2info_set_request_filled(&oifi->ifi_req); - - if (o2info_to_user(*oifi, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oifi->ifi_req, req); - - kfree(oifi); - - return status; -} - -static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, - unsigned int chunksize) -{ - int index; - - index = __ilog2_u32(chunksize); - if (index >= OCFS2_INFO_MAX_HIST) - index = OCFS2_INFO_MAX_HIST - 1; - 
- hist->fc_chunks[index]++; - hist->fc_clusters[index] += chunksize; -} - -static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, - unsigned int chunksize) -{ - if (chunksize > stats->ffs_max) - stats->ffs_max = chunksize; - - if (chunksize < stats->ffs_min) - stats->ffs_min = chunksize; - - stats->ffs_avg += chunksize; - stats->ffs_free_chunks_real++; -} - -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, - unsigned int chunksize) -{ - o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); - o2ffg_update_stats(&(ffg->iff_ffs), chunksize); -} - -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, - struct inode *gb_inode, - struct ocfs2_dinode *gb_dinode, - struct ocfs2_chain_rec *rec, - struct ocfs2_info_freefrag *ffg, - u32 chunks_in_group) -{ - int status = 0, used; - u64 blkno; - - struct buffer_head *bh = NULL; - struct ocfs2_group_desc *bg = NULL; - - unsigned int max_bits, num_clusters; - unsigned int offset = 0, cluster, chunk; - unsigned int chunk_free, last_chunksize = 0; - - if (!le32_to_cpu(rec->c_free)) - goto bail; - - do { - if (!bg) - blkno = le64_to_cpu(rec->c_blkno); - else - blkno = le64_to_cpu(bg->bg_next_group); - - if (bh) { - brelse(bh); - bh = NULL; - } - - if (o2info_coherent(&ffg->iff_req)) - status = ocfs2_read_group_descriptor(gb_inode, - gb_dinode, - blkno, &bh); - else - status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); - - if (status < 0) { - mlog(ML_ERROR, "Can't read the group descriptor # " - "%llu from device.", (unsigned long long)blkno); - status = -EIO; - goto bail; - } - - bg = (struct ocfs2_group_desc *)bh->b_data; - - if (!le16_to_cpu(bg->bg_free_bits_count)) - continue; - - max_bits = le16_to_cpu(bg->bg_bits); - offset = 0; - - for (chunk = 0; chunk < chunks_in_group; chunk++) { - /* - * the last chunk may not be an entire one. - */ - if ((offset + ffg->iff_chunksize) > max_bits) - num_clusters = max_bits - offset; - else - num_clusters = ffg->iff_chunksize; - - chunk_free = 0; - for (cluster = 0; cluster < num_clusters; cluster++) { - used = ocfs2_test_bit(offset, - (unsigned long *)bg->bg_bitmap); - /* - * - chunk_free counts free clusters in chunk #N. - * - last_chunksize records the size (in clusters) - * of the last real free chunk being counted. - */ - if (!used) { - last_chunksize++; - chunk_free++; - } - - if (used && last_chunksize) { - ocfs2_info_update_ffg(ffg, - last_chunksize); - last_chunksize = 0; - } - - offset++; - } - - if (chunk_free == ffg->iff_chunksize) - ffg->iff_ffs.ffs_free_chunks++; - } - - /* - * need to update the info for the last free chunk. 
- */ - if (last_chunksize) - ocfs2_info_update_ffg(ffg, last_chunksize); - - } while (le64_to_cpu(bg->bg_next_group)); - -bail: - brelse(bh); - - return status; -} - -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, - struct inode *gb_inode, u64 blkno, - struct ocfs2_info_freefrag *ffg) -{ - u32 chunks_in_group; - int status = 0, unlock = 0, i; - - struct buffer_head *bh = NULL; - struct ocfs2_chain_list *cl = NULL; - struct ocfs2_chain_rec *rec = NULL; - struct ocfs2_dinode *gb_dinode = NULL; - - if (gb_inode) - mutex_lock(&gb_inode->i_mutex); - - if (o2info_coherent(&ffg->iff_req)) { - status = ocfs2_inode_lock(gb_inode, &bh, 0); - if (status < 0) { - mlog_errno(status); - goto bail; - } - unlock = 1; - } else { - status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - gb_dinode = (struct ocfs2_dinode *)bh->b_data; - cl = &(gb_dinode->id2.i_chain); - - /* - * Chunksize(in) clusters from userspace should be - * less than clusters in a group. - */ - if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { - status = -EINVAL; - goto bail; - } - - memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); - - ffg->iff_ffs.ffs_min = ~0U; - ffg->iff_ffs.ffs_clusters = - le32_to_cpu(gb_dinode->id1.bitmap1.i_total); - ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - - le32_to_cpu(gb_dinode->id1.bitmap1.i_used); - - chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; - - for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { - rec = &(cl->cl_recs[i]); - status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, - gb_dinode, - rec, ffg, - chunks_in_group); - if (status) - goto bail; - } - - if (ffg->iff_ffs.ffs_free_chunks_real) - ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / - ffg->iff_ffs.ffs_free_chunks_real); -bail: - if (unlock) - ocfs2_inode_unlock(gb_inode, 0); - - if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); - - if (gb_inode) - iput(gb_inode); - - brelse(bh); - - return status; -} - -int ocfs2_info_handle_freefrag(struct inode *inode, - struct ocfs2_info_request __user *req) -{ - u64 blkno = -1; - char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; - - struct ocfs2_info_freefrag *oiff; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *gb_inode = NULL; - - oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); - if (!oiff) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - if (o2info_from_user(*oiff, req)) - goto bail; - /* - * chunksize from userspace should be power of 2. 
- */ - if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || - (!oiff->iff_chunksize)) { - status = -EINVAL; - goto bail; - } - - if (o2info_coherent(&oiff->iff_req)) { - gb_inode = ocfs2_get_system_file_inode(osb, type, - OCFS2_INVALID_SLOT); - if (!gb_inode) { - mlog(ML_ERROR, "unable to get global_bitmap inode\n"); - status = -EIO; - goto bail; - } - } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); - if (status < 0) { - status = -ENOENT; - goto bail; - } - } - - status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); - if (status < 0) - goto bail; - - o2info_set_request_filled(&oiff->iff_req); - - if (o2info_to_user(*oiff, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiff->iff_req, req); - - kfree(oiff); + o2info_set_request_error(oij, req); return status; } @@ -729,7 +327,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, if (o2info_from_user(oir, req)) goto bail; - o2info_clear_request_filled(&oir); + o2info_clear_request_filled(oir); if (o2info_to_user(oir, req)) goto bail; @@ -737,7 +335,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oir, req); + o2info_set_request_error(oir, req); return status; } @@ -791,14 +389,6 @@ int ocfs2_info_handle_request(struct inode *inode, if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; - case OCFS2_INFO_FREEINODE: - if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) - status = ocfs2_info_handle_freeinode(inode, req); - break; - case OCFS2_INFO_FREEFRAG: - if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) - status = ocfs2_info_handle_freefrag(inode, req); - break; default: status = ocfs2_info_handle_unknown(inode, req); break; @@ -952,31 +542,6 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); - case FITRIM: - { - struct super_block *sb = inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&range, (struct fstrim_range *)arg, - sizeof(range))) - return -EFAULT; - - ret = ocfs2_trim_fs(sb, &range); - if (ret < 0) - return ret; - - if (copy_to_user((struct fstrim_range *)arg, &range, - sizeof(range))) - return -EFAULT; - - return 0; - } - case OCFS2_IOC_MOVE_EXT: - return ocfs2_ioctl_move_extents(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1004,7 +569,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, (struct reflink_arguments *)arg, @@ -1020,8 +584,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); - case OCFS2_IOC_MOVE_EXT: - break; default: return -ENOIOCTLCMD; } diff --git a/trunk/fs/ocfs2/move_extents.c b/trunk/fs/ocfs2/move_extents.c deleted file mode 100644 index 4c5488468c14..000000000000 --- a/trunk/fs/ocfs2/move_extents.c +++ /dev/null @@ -1,1153 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * move_extents.c - * - * Copyright (C) 2011 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include -#include - -#include - -#include "ocfs2.h" -#include "ocfs2_ioctl.h" - -#include "alloc.h" -#include "aops.h" -#include "dlmglue.h" -#include "extent_map.h" -#include "inode.h" -#include "journal.h" -#include "suballoc.h" -#include "uptodate.h" -#include "super.h" -#include "dir.h" -#include "buffer_head_io.h" -#include "sysfile.h" -#include "suballoc.h" -#include "refcounttree.h" -#include "move_extents.h" - -struct ocfs2_move_extents_context { - struct inode *inode; - struct file *file; - int auto_defrag; - int partial; - int credits; - u32 new_phys_cpos; - u32 clusters_moved; - u64 refcount_loc; - struct ocfs2_move_extents *range; - struct ocfs2_extent_tree et; - struct ocfs2_alloc_context *meta_ac; - struct ocfs2_alloc_context *data_ac; - struct ocfs2_cached_dealloc_ctxt dealloc; -}; - -static int __ocfs2_move_extent(handle_t *handle, - struct ocfs2_move_extents_context *context, - u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, - int ext_flags) -{ - int ret = 0, index; - struct inode *inode = context->inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_extent_rec *rec, replace_rec; - struct ocfs2_path *path = NULL; - struct ocfs2_extent_list *el; - u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); - u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); - - ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, - p_cpos, new_p_cpos, len); - if (ret) { - mlog_errno(ret); - goto out; - } - - memset(&replace_rec, 0, sizeof(replace_rec)); - replace_rec.e_cpos = cpu_to_le32(cpos); - replace_rec.e_leaf_clusters = cpu_to_le16(len); - replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, - new_p_cpos)); - - path = ocfs2_new_path_from_et(&context->et); - if (!path) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); - if (ret) { - mlog_errno(ret); - goto out; - } - - el = path_leaf_el(path); - - index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; - goto out; - } - - rec = &el->l_recs[index]; - - BUG_ON(ext_flags != rec->e_flags); - /* - * after moving/defraging to new location, the extent is not going - * to be refcounted anymore. - */ - replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), - context->et.et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_split_extent(handle, &context->et, path, index, - &replace_rec, context->meta_ac, - &context->dealloc); - if (ret) { - mlog_errno(ret); - goto out; - } - - ocfs2_journal_dirty(handle, context->et.et_root_bh); - - context->new_phys_cpos = new_p_cpos; - - /* - * need I to append truncate log for old clusters? 
- */ - if (old_blkno) { - if (ext_flags & OCFS2_EXT_REFCOUNTED) - ret = ocfs2_decrease_refcount(inode, handle, - ocfs2_blocks_to_clusters(osb->sb, - old_blkno), - len, context->meta_ac, - &context->dealloc, 1); - else - ret = ocfs2_truncate_log_append(osb, handle, - old_blkno, len); - } - -out: - return ret; -} - -/* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. - */ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, - struct ocfs2_extent_tree *et, - u32 clusters_to_move, - u32 extents_to_split, - struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, - int extra_blocks, - int *credits) -{ - int ret, num_free_extents; - unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - num_free_extents = ocfs2_num_free_extents(osb, et); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) - extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); - - ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, - clusters_to_move + 2); - - mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", - extra_blocks, clusters_to_move, *credits); -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } - } - - return ret; -} - -/* - * Using one journal handle to guarantee the data consistency in case - * crash happens anywhere. - * - * XXX: defrag can end up with finishing partial extent as requested, - * due to not enough contiguous clusters can be found in allocator. - */ -static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, - u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) -{ - int ret, credits = 0, extra_blocks = 0, partial = context->partial; - handle_t *handle; - struct inode *inode = context->inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - struct ocfs2_refcount_tree *ref_tree = NULL; - u32 new_phys_cpos, new_len; - u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - - if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - - BUG_ON(!context->refcount_loc); - - ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - return ret; - } - - ret = ocfs2_prepare_refcount_change_for_del(inode, - context->refcount_loc, - phys_blkno, - *len, - &credits, - &extra_blocks); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); - if (ret) { - mlog_errno(ret); - goto out; - } - - /* - * should be using allocation reservation strategy there? 
- * - * if (context->data_ac) - * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; - */ - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock_mutex; - } - } - - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock_mutex; - } - - ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, - &new_phys_cpos, &new_len); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - /* - * allowing partial extent moving is kind of 'pros and cons', it makes - * whole defragmentation less likely to fail, on the contrary, the bad - * thing is it may make the fs even more fragmented after moving, let - * userspace make a good decision here. - */ - if (new_len != *len) { - mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); - if (!partial) { - context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; - ret = -ENOSPC; - goto out_commit; - } - } - - mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, - phys_cpos, new_phys_cpos); - - ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, - new_phys_cpos, ext_flags); - if (ret) - mlog_errno(ret); - - if (partial && (new_len != *len)) - *len = new_len; - - /* - * Here we should write the new page out first if we are - * in write-back mode. - */ - ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); - if (ret) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); - -out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); - - if (context->data_ac) { - ocfs2_free_alloc_context(context->data_ac); - context->data_ac = NULL; - } - - if (context->meta_ac) { - ocfs2_free_alloc_context(context->meta_ac); - context->meta_ac = NULL; - } - -out: - if (ref_tree) - ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - - return ret; -} - -/* - * find the victim alloc group, where #blkno fits. - */ -static int ocfs2_find_victim_alloc_group(struct inode *inode, - u64 vict_blkno, - int type, int slot, - int *vict_bit, - struct buffer_head **ret_bh) -{ - int ret, i, blocks_per_unit = 1; - u64 blkno; - char namebuf[40]; - - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *ac_bh = NULL, *gd_bh = NULL; - struct ocfs2_chain_list *cl; - struct ocfs2_chain_rec *rec; - struct ocfs2_dinode *ac_dinode; - struct ocfs2_group_desc *bg; - - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); - if (ret) { - ret = -ENOENT; - goto out; - } - - ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; - cl = &(ac_dinode->id2.i_chain); - rec = &(cl->cl_recs[0]); - - if (type == GLOBAL_BITMAP_SYSTEM_INODE) - blocks_per_unit <<= (osb->s_clustersize_bits - - inode->i_sb->s_blocksize_bits); - /* - * 'vict_blkno' was out of the valid range. 
- */ - if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || - (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) * - blocks_per_unit))) { - ret = -EINVAL; - goto out; - } - - for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { - - rec = &(cl->cl_recs[i]); - if (!rec) - continue; - - bg = NULL; - - do { - if (!bg) - blkno = le64_to_cpu(rec->c_blkno); - else - blkno = le64_to_cpu(bg->bg_next_group); - - if (gd_bh) { - brelse(gd_bh); - gd_bh = NULL; - } - - ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - bg = (struct ocfs2_group_desc *)gd_bh->b_data; - - if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + - le16_to_cpu(bg->bg_bits))) { - - *ret_bh = gd_bh; - *vict_bit = (vict_blkno - blkno) / - blocks_per_unit; - mlog(0, "find the victim group: #%llu, " - "total_bits: %u, vict_bit: %u\n", - blkno, le16_to_cpu(bg->bg_bits), - *vict_bit); - goto out; - } - - } while (le64_to_cpu(bg->bg_next_group)); - } - - ret = -EINVAL; -out: - brelse(ac_bh); - - /* - * caller has to release the gd_bh properly. - */ - return ret; -} - -/* - * XXX: helper to validate and adjust the moving goal. - */ -static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, - struct ocfs2_move_extents *range) -{ - int ret, goal_bit = 0; - - struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *bg; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int c_to_b = 1 << (osb->s_clustersize_bits - - inode->i_sb->s_blocksize_bits); - - /* - * validate goal sits within global_bitmap, and return the victim - * group desc - */ - ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, - GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT, - &goal_bit, &gd_bh); - if (ret) - goto out; - - bg = (struct ocfs2_group_desc *)gd_bh->b_data; - - /* - * make goal become cluster aligned. - */ - if (range->me_goal % c_to_b) - range->me_goal = range->me_goal / c_to_b * c_to_b; - - /* - * the moving goal is not allowed to start at a group desc block (#0 blk); - * compromise to the next cluster. - */ - if (range->me_goal == le64_to_cpu(bg->bg_blkno)) - range->me_goal += c_to_b; - - /* - * the movement may not cross two groups. - */ - if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < - range->me_len) { - ret = -EINVAL; - goto out; - } - /* - * more exact validations/adjustments will be performed later during - * the moving operation for each extent range. - */ - mlog(0, "extents get ready to be moved to #%llu block\n", - range->me_goal); - -out: - brelse(gd_bh); - - return ret; -}
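The goal adjustment above boils down to two integer steps: round the caller's block goal down to a cluster boundary, then step past the group-descriptor block if the goal lands on it. A standalone sketch of that arithmetic, assuming a hypothetical 8-blocks-per-cluster geometry (plain C, not ocfs2 code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t c_to_b = 8;       /* blocks per cluster; 1 << (cluster bits - block bits) */
	uint64_t bg_blkno = 4096;  /* pretend group-descriptor block */
	uint64_t goal = 4101;      /* caller-supplied physical block goal */

	if (goal % c_to_b)                 /* make goal become cluster aligned */
		goal = goal / c_to_b * c_to_b;

	if (goal == bg_blkno)              /* never start on the descriptor block */
		goal += c_to_b;

	printf("aligned goal: %llu\n", (unsigned long long)goal);
	return 0;
}

Here 4101 rounds down to 4096, which collides with the descriptor block and is bumped to 4104, mirroring the two checks in the deleted function.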
- -static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, - int *goal_bit, u32 move_len, u32 max_hop, - u32 *phys_cpos) -{ - int i, used, last_free_bits = 0, base_bit = *goal_bit; - struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; - u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(gd->bg_blkno)); - - for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { - - used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); - if (used) { - /* - * we even tried searching the free chunk by jumping - * a 'max_hop' distance, but still failed. - */ - if ((i - base_bit) > max_hop) { - *phys_cpos = 0; - break; - } - - if (last_free_bits) - last_free_bits = 0; - - continue; - } else - last_free_bits++; - - if (last_free_bits == move_len) { - *goal_bit = i; - *phys_cpos = base_cpos + i; - break; - } - } - - mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); -} - -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -}
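ocfs2_alloc_dinode_update_counts() keeps two on-disk counters in lock-step: the allocator inode's i_used grows by exactly the num_bits that the chain record's c_free loses. A plain-C model of that pairing; the kernel goes through le32_to_cpu()/le32_add_cpu() because the fields are little-endian on disk, which this sketch deliberately omits:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

struct chain_rec { uint32_t c_free; };

struct alloc_dinode {
	uint32_t i_total;        /* bits in the group */
	uint32_t i_used;         /* grows by num_bits */
	struct chain_rec rec;    /* c_free shrinks by num_bits */
};

static void update_counts(struct alloc_dinode *di, uint32_t num_bits)
{
	di->i_used += num_bits;
	di->rec.c_free -= num_bits;
	assert(di->i_used <= di->i_total);               /* can't overflow the group */
	assert(di->i_used + di->rec.c_free == di->i_total); /* counters stay paired */
}

int main(void)
{
	struct alloc_dinode di = { 256, 100, { 156 } };
	update_counts(&di, 16);
	printf("used %u, free %u\n", di.i_used, di.rec.c_free);
	return 0;
}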
num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - -static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, - u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, - u32 len, int ext_flags) -{ - int ret, credits = 0, extra_blocks = 0, goal_bit = 0; - handle_t *handle; - struct inode *inode = context->inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - struct inode *gb_inode = NULL; - struct buffer_head *gb_bh = NULL; - struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *gd; - struct ocfs2_refcount_tree *ref_tree = NULL; - u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, - context->range->me_threshold); - u64 phys_blkno, new_phys_blkno; - - phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - - if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - - BUG_ON(!context->refcount_loc); - - ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - return ret; - } - - ret = ocfs2_prepare_refcount_change_for_del(inode, - context->refcount_loc, - phys_blkno, - len, - &credits, - &extra_blocks); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); - if (ret) { - mlog_errno(ret); - goto out; - } - - /* - * need to count 2 extra credits for global_bitmap inode and - * group descriptor. - */ - credits += OCFS2_INODE_UPDATE_CREDITS + 1; - - /* - * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() - * logic, while we still need to lock the global_bitmap. - */ - gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT); - if (!gb_inode) { - mlog(ML_ERROR, "unable to get global_bitmap inode\n"); - ret = -EIO; - goto out; - } - - mutex_lock(&gb_inode->i_mutex); - - ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); - if (ret) { - mlog_errno(ret); - goto out_unlock_gb_mutex; - } - - mutex_lock(&tl_inode->i_mutex); - - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock_tl_inode; - } - - new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); - ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, - GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT, - &goal_bit, &gd_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - /* - * probe the victim cluster group to find a proper - * region to fit wanted movement, it even will perfrom - * a best-effort attempt by compromising to a threshold - * around the goal. 
- -/* - * Helper to calculate the defraging length in one run according to threshold. - */ -static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, - u32 threshold, int *skip) -{ - if ((*alloc_size + *len_defraged) < threshold) { - /* - * proceed defragmentation until we meet the thresh - */ - *len_defraged += *alloc_size; - } else if (*len_defraged == 0) { - /* - * XXX: skip a large extent. - */ - *skip = 1; - } else { - /* - * split this extent to coalesce with former pieces so as - * to reach the threshold. - * - * we're done here with one cycle of defragmentation - * in a size of 'thresh'; resetting 'len_defraged' - * forces a new defragmentation. - */ - *alloc_size = threshold - *len_defraged; - *len_defraged = 0; - } -}
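A worked example of the thresholding above, assuming a 16-cluster threshold: extents of 6, 6 and 10 clusters arrive; the first two accumulate whole, and the third is split so the cycle closes at exactly 16 clusters before a fresh cycle begins:

#include <stdio.h>

static void calc_defrag_len(unsigned *alloc_size, unsigned *len_defraged,
			    unsigned threshold, int *skip)
{
	if (*alloc_size + *len_defraged < threshold) {
		*len_defraged += *alloc_size;            /* keep accumulating */
	} else if (*len_defraged == 0) {
		*skip = 1;                               /* lone large extent: skip */
	} else {
		*alloc_size = threshold - *len_defraged; /* split to fit the cycle */
		*len_defraged = 0;                       /* forces a new cycle */
	}
}

int main(void)
{
	unsigned extents[] = { 6, 6, 10 }, len_defraged = 0;
	for (int i = 0; i < 3; i++) {
		unsigned sz = extents[i];
		int skip = 0;
		calc_defrag_len(&sz, &len_defraged, 16, &skip);
		printf("extent %u -> move %u clusters%s\n",
		       extents[i], skip ? 0 : sz, skip ? " (skipped)" : "");
	}
	return 0;
}

In the real code the split remainder is revisited on the caller's next loop pass, since cpos only advances by the clamped alloc_size.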
- -static int __ocfs2_move_extents_range(struct buffer_head *di_bh, - struct ocfs2_move_extents_context *context) -{ - int ret = 0, flags, do_defrag, skip = 0; - u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; - u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; - - struct inode *inode = context->inode; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_move_extents *range = context->range; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - if ((inode->i_size == 0) || (range->me_len == 0)) - return 0; - - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return 0; - - context->refcount_loc = le64_to_cpu(di->i_refcount_loc); - - ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); - ocfs2_init_dealloc_ctxt(&context->dealloc); - - /* - * TO-DO XXX: - * - * - xattr extents. - */ - - do_defrag = context->auto_defrag; - - /* - * extent moving happens in units of clusters; for the sake - * of simplicity, we may ignore the two clusters that 'byte_start' - * and 'byte_start + len' fall within. - */ - move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); - len_to_move = (range->me_start + range->me_len) >> - osb->s_clustersize_bits; - if (len_to_move >= move_start) - len_to_move -= move_start; - else - len_to_move = 0; - - if (do_defrag) { - defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; - if (defrag_thresh <= 1) - goto done; - } else - new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - range->me_goal); - - mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " - "thresh: %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)range->me_start, - (unsigned long long)range->me_len, - move_start, len_to_move, defrag_thresh); - - cpos = move_start; - while (len_to_move) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, - &flags); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (alloc_size > len_to_move) - alloc_size = len_to_move; - - /* - * XXX: how to deal with a hole: - * - * - skip the hole of course - * - force a new defragmentation - */ - if (!phys_cpos) { - if (do_defrag) - len_defraged = 0; - - goto next; - } - - if (do_defrag) { - ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, - defrag_thresh, &skip); - /* - * skip large extents - */ - if (skip) { - skip = 0; - goto next; - } - - mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " - "alloc_size: %u, len_defraged: %u\n", - cpos, phys_cpos, alloc_size, len_defraged); - - ret = ocfs2_defrag_extent(context, cpos, phys_cpos, - &alloc_size, flags); - } else { - ret = ocfs2_move_extent(context, cpos, phys_cpos, - &new_phys_cpos, alloc_size, - flags); - - new_phys_cpos += alloc_size; - } - - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - context->clusters_moved += alloc_size; -next: - cpos += alloc_size; - len_to_move -= alloc_size; - } - -done: - range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; - -out: - range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, - context->clusters_moved); - range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, - context->new_phys_cpos); - - ocfs2_schedule_truncate_log_flush(osb, 1); - ocfs2_run_deallocs(osb, &context->dealloc); - - return ret; -} - -static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) -{ - int status; - handle_t *handle; - struct inode *inode = context->inode; - struct ocfs2_dinode *di; - struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - if (!inode) - return -ENOENT; - - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) - return -EROFS; - - mutex_lock(&inode->i_mutex); - - /* - * This prevents concurrent writes from other nodes - */ - status = ocfs2_rw_lock(inode, 1); - if (status) { - mlog_errno(status); - goto out; - } - - status = ocfs2_inode_lock(inode, &di_bh, 1); - if (status) { - mlog_errno(status); - goto out_rw_unlock; - } - - /* - * remember ip_xattr_sem also needs to be held if necessary - */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); - - status = __ocfs2_move_extents_range(di_bh, context); - - up_write(&OCFS2_I(inode)->ip_alloc_sem); - if (status) { - mlog_errno(status); - goto out_inode_unlock; - } - - /* - * We update ctime for these changes - */ - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_inode_unlock; - } - - status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status) { - mlog_errno(status); - goto out_commit; - } - - di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - - ocfs2_journal_dirty(handle, di_bh); - -out_commit: - ocfs2_commit_trans(osb, handle); - -out_inode_unlock: - brelse(di_bh); - ocfs2_inode_unlock(inode, 1); -out_rw_unlock: - ocfs2_rw_unlock(inode, 1); -out: - mutex_unlock(&inode->i_mutex); - - return status; -}
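ocfs2_move_extents() above takes i_mutex, the rw lock and the cluster inode lock in that order, then unwinds through its goto ladder in strict reverse order. A compilable skeleton of that error-unwind shape, with stub lock functions standing in for the ocfs2 primitives:

#include <stdio.h>

static int lock_ok(const char *name) { printf("lock %s\n", name); return 0; }
static void unlock(const char *name) { printf("unlock %s\n", name); }
static int work(void) { puts("move extents"); return 0; }

static int do_move(void)
{
	int status;

	lock_ok("i_mutex");            /* mutex_lock() can't fail */
	status = lock_ok("rw");        /* ocfs2_rw_lock(inode, 1) */
	if (status)
		goto out;
	status = lock_ok("inode");     /* ocfs2_inode_lock(inode, &di_bh, 1) */
	if (status)
		goto out_rw_unlock;

	status = work();               /* __ocfs2_move_extents_range() */

	unlock("inode");
out_rw_unlock:
	unlock("rw");
out:
	unlock("i_mutex");
	return status;
}

int main(void) { return do_move(); }

Each failure point jumps to the label that releases only what has already been acquired, so no lock is ever dropped twice or leaked.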
- -int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) -{ - int status; - - struct inode *inode = filp->f_path.dentry->d_inode; - struct ocfs2_move_extents range; - struct ocfs2_move_extents_context *context = NULL; - - status = mnt_want_write(filp->f_path.mnt); - if (status) - return status; - - if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) - goto out; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { - status = -EPERM; - goto out; - } - - context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); - if (!context) { - status = -ENOMEM; - mlog_errno(status); - goto out; - } - - context->inode = inode; - context->file = filp; - - if (argp) { - if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, - sizeof(range))) { - status = -EFAULT; - goto out; - } - } else { - status = -EINVAL; - goto out; - } - - if (range.me_start > i_size_read(inode)) - goto out; - - if (range.me_start + range.me_len > i_size_read(inode)) - range.me_len = i_size_read(inode) - range.me_start; - - context->range = &range; - - if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { - context->auto_defrag = 1; - /* - * ok, the default threshold for the defragmentation - * is 1M, since our maximum clustersize was 1M also. - * any thought? - */ - if (!range.me_threshold) - range.me_threshold = 1024 * 1024; - - if (range.me_threshold > i_size_read(inode)) - range.me_threshold = i_size_read(inode); - - if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) - context->partial = 1; - } else { - /* - * first best-effort attempt to validate and adjust the goal - * (physical address in block), while it can't guarantee later - * operation can succeed all the time since global_bitmap may - * change a bit over time. - */ - - status = ocfs2_validate_and_adjust_move_goal(inode, &range); - if (status) - goto out; - } - - status = ocfs2_move_extents(context); - if (status) - mlog_errno(status); -out: - /* - * movement/defragmentation may end up being partially completed, - * that's the reason why we need to return to userspace the finished - * length and new_offset even if a failure happens somewhere. - */ - if (argp) { - if (copy_to_user((struct ocfs2_move_extents *)argp, &range, - sizeof(range))) - status = -EFAULT; - } - - kfree(context); - - mnt_drop_write(filp->f_path.mnt); - - return status; -} diff --git a/trunk/fs/ocfs2/move_extents.h b/trunk/fs/ocfs2/move_extents.h deleted file mode 100644 index 4e143e811441..000000000000 --- a/trunk/fs/ocfs2/move_extents.h +++ /dev/null @@ -1,22 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * move_extents.h - * - * Copyright (C) 2011 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#ifndef OCFS2_MOVE_EXTENTS_H -#define OCFS2_MOVE_EXTENTS_H - -int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); - -#endif /* OCFS2_MOVE_EXTENTS_H */
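Userspace drove the interface declared above through OCFS2_IOC_MOVE_EXT, using the struct ocfs2_move_extents that the ocfs2_ioctl.h hunk below removes. A hedged usage sketch against a pre-revert kernel; the include path is an assumption:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <ocfs2/ocfs2_ioctl.h>   /* hypothetical install path for the header */

int main(int argc, char **argv)
{
	struct ocfs2_move_extents range;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;

	memset(&range, 0, sizeof(range));
	range.me_start = 0;                  /* all values in bytes */
	range.me_len = 16 * 1024 * 1024;
	range.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG;
	/* me_threshold left 0: the kernel falls back to its 1MB default */

	if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &range) < 0)
		perror("OCFS2_IOC_MOVE_EXT");

	if (!(range.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE))
		fprintf(stderr, "only %llu bytes defragged\n",
			(unsigned long long)range.me_moved_len);

	close(fd);
	return 0;
}

Note that me_moved_len is filled in even on failure, matching the copy_to_user() in the error path of ocfs2_ioctl_move_extents() above.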
See the GNU - * General Public License for more details. - */ -#ifndef OCFS2_MOVE_EXTENTS_H -#define OCFS2_MOVE_EXTENTS_H - -int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); - -#endif /* OCFS2_MOVE_EXTENTS_H */ diff --git a/trunk/fs/ocfs2/ocfs2_ioctl.h b/trunk/fs/ocfs2/ocfs2_ioctl.h index 5b27ff1fa577..b46f39bf7438 100644 --- a/trunk/fs/ocfs2/ocfs2_ioctl.h +++ b/trunk/fs/ocfs2/ocfs2_ioctl.h @@ -142,38 +142,6 @@ struct ocfs2_info_journal_size { __u64 ij_journal_size; }; -struct ocfs2_info_freeinode { - struct ocfs2_info_request ifi_req; - struct ocfs2_info_local_freeinode { - __u64 lfi_total; - __u64 lfi_free; - } ifi_stat[OCFS2_MAX_SLOTS]; - __u32 ifi_slotnum; /* out */ - __u32 ifi_pad; -}; - -#define OCFS2_INFO_MAX_HIST (32) - -struct ocfs2_info_freefrag { - struct ocfs2_info_request iff_req; - struct ocfs2_info_freefrag_stats { /* (out) */ - struct ocfs2_info_free_chunk_list { - __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; - __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; - } ffs_fc_hist; - __u32 ffs_clusters; - __u32 ffs_free_clusters; - __u32 ffs_free_chunks; - __u32 ffs_free_chunks_real; - __u32 ffs_min; /* Minimum free chunksize in clusters */ - __u32 ffs_max; - __u32 ffs_avg; - __u32 ffs_pad; - } iff_ffs; - __u32 iff_chunksize; /* chunksize in clusters(in) */ - __u32 iff_pad; -}; - /* Codes for ocfs2_info_request */ enum ocfs2_info_type { OCFS2_INFO_CLUSTERSIZE = 1, @@ -183,8 +151,6 @@ enum ocfs2_info_type { OCFS2_INFO_UUID, OCFS2_INFO_FS_FEATURES, OCFS2_INFO_JOURNAL_SIZE, - OCFS2_INFO_FREEINODE, - OCFS2_INFO_FREEFRAG, OCFS2_INFO_NUM_TYPES }; @@ -205,38 +171,4 @@ enum ocfs2_info_type { #define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) -struct ocfs2_move_extents { -/* All values are in bytes */ - /* in */ - __u64 me_start; /* Virtual start in the file to move */ - __u64 me_len; /* Length of the extents to be moved */ - __u64 me_goal; /* Physical offset of the goal, - it's in block unit */ - __u64 me_threshold; /* Maximum distance from goal or threshold - for auto defragmentation */ - __u64 me_flags; /* Flags for the operation: - * - auto defragmentation. - * - refcount,xattr cases. - */ - /* out */ - __u64 me_moved_len; /* Moved/defraged length */ - __u64 me_new_offset; /* Resulting physical location */ - __u32 me_reserved[2]; /* Reserved for future */ -}; - -#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to - claim new clusters - as the goal place - for extents moving */ -#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent - moving, is to make - movement less likely - to fail, may make fs - even more fragmented */ -#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation - completely gets done.
- */ - -#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) - #endif /* OCFS2_IOCTL_H */ diff --git a/trunk/fs/ocfs2/ocfs2_trace.h b/trunk/fs/ocfs2/ocfs2_trace.h index 3b481f490633..a1dae5bb54ac 100644 --- a/trunk/fs/ocfs2/ocfs2_trace.h +++ b/trunk/fs/ocfs2/ocfs2_trace.h @@ -688,31 +688,6 @@ TRACE_EVENT(ocfs2_cache_block_dealloc, __entry->blkno, __entry->bit) ); -TRACE_EVENT(ocfs2_trim_extent, - TP_PROTO(struct super_block *sb, unsigned long long blk, - unsigned long long count), - TP_ARGS(sb, blk, count), - TP_STRUCT__entry( - __field(int, dev_major) - __field(int, dev_minor) - __field(unsigned long long, blk) - __field(__u64, count) - ), - TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); - __entry->blk = blk; - __entry->count = count; - ), - TP_printk("%d %d %llu %llu", - __entry->dev_major, __entry->dev_minor, - __entry->blk, __entry->count) -); - -DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); - -DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); - /* End of trace events for fs/ocfs2/alloc.c. */ /* Trace events for fs/ocfs2/localalloc.c. */ diff --git a/trunk/fs/ocfs2/refcounttree.c b/trunk/fs/ocfs2/refcounttree.c index ebfd3825f12a..3c7606cff1ab 100644 --- a/trunk/fs/ocfs2/refcounttree.c +++ b/trunk/fs/ocfs2/refcounttree.c @@ -66,7 +66,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct file *file, + struct ocfs2_cow_context *context, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -2921,21 +2921,20 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) return 0; } -int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +static int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file->f_path.dentry->d_inode; - struct ocfs2_caching_info *ci = INODE_CACHE(inode); + struct ocfs2_caching_info *ci = context->data_et.et_ci; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; unsigned int from, to, readahead_pages; loff_t offset, end, map_end; - struct address_space *mapping = inode->i_mapping; + struct address_space *mapping = context->inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); @@ -2949,8 +2948,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, * We only duplicate pages until we reach the page contains i_size - 1. * So trim 'end' to i_size. 
*/ - if (end > i_size_read(inode)) - end = i_size_read(inode); + if (end > i_size_read(context->inode)) + end = i_size_read(context->inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -2973,9 +2972,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); - if (PageReadahead(page)) { + if (PageReadahead(page) && context->file) { page_cache_async_readahead(mapping, - &file->f_ra, file, + &context->file->f_ra, + context->file, page, page_index, readahead_pages); } @@ -2999,7 +2999,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(inode, handle, from, to, + ocfs2_map_and_dirty_page(context->inode, + handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: @@ -3014,15 +3015,14 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, return ret; } -int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct ocfs2_caching_info *ci = INODE_CACHE(inode); + struct super_block *sb = context->inode->i_sb; + struct ocfs2_caching_info *ci = context->data_et.et_ci; int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); @@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle, /* If the old clusters are unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context->file, - cpos, old, new, len); + ret = context->cow_duplicate_clusters(handle, context, cpos, + old, new, len); if (ret) { mlog_errno(ret); goto out; @@ -3162,22 +3162,22 @@ static int ocfs2_replace_clusters(handle_t *handle, return ret; } -int ocfs2_cow_sync_writeback(struct super_block *sb, - struct inode *inode, - u32 cpos, u32 num_clusters) +static int ocfs2_cow_sync_writeback(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 num_clusters) { int ret = 0; loff_t offset, end, map_end; pgoff_t page_index; struct page *page; - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(context->inode)) return 0; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); - ret = filemap_fdatawrite_range(inode->i_mapping, + ret = filemap_fdatawrite_range(context->inode->i_mapping, offset, end - 1); if (ret < 0) { mlog_errno(ret); @@ -3190,7 +3190,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = find_or_create_page(inode->i_mapping, + page = find_or_create_page(context->inode->i_mapping, page_index, GFP_NOFS); BUG_ON(!page); @@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode.
*/ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, + ret = ocfs2_cow_sync_writeback(sb, context, cpos, orig_num_clusters); if (ret) mlog_errno(ret); diff --git a/trunk/fs/ocfs2/refcounttree.h b/trunk/fs/ocfs2/refcounttree.h index 7754608c83a4..c8ce46f7d8e3 100644 --- a/trunk/fs/ocfs2/refcounttree.h +++ b/trunk/fs/ocfs2/refcounttree.h @@ -84,17 +84,6 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, struct buffer_head *ref_root_bh, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); -int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len); -int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len); -int ocfs2_cow_sync_writeback(struct super_block *sb, - struct inode *inode, - u32 cpos, u32 num_clusters); int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_extent_tree *data_et, struct ocfs2_caching_info *ref_ci, diff --git a/trunk/fs/ocfs2/super.c b/trunk/fs/ocfs2/super.c index cdbaf5e97308..5a521c748859 100644 --- a/trunk/fs/ocfs2/super.c +++ b/trunk/fs/ocfs2/super.c @@ -41,7 +41,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" @@ -1567,7 +1566,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->preferred_slot != OCFS2_INVALID_SLOT) seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); - if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) + if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); if (osb->osb_commit_interval) @@ -2353,7 +2352,6 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&uuid_net_key, sb); bail: return status; diff --git a/trunk/fs/omfs/dir.c b/trunk/fs/omfs/dir.c index c368360c35a1..de4ff29f1e05 100644 --- a/trunk/fs/omfs/dir.c +++ b/trunk/fs/omfs/dir.c @@ -240,12 +240,8 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - - if (S_ISDIR(inode->i_mode)) { - dentry_unhash(dentry); - if (!omfs_dir_is_empty(inode)) - return -ENOTEMPTY; - } + if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; ret = omfs_delete_entry(dentry); if (ret) @@ -382,9 +378,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, int err; if (new_inode) { - if (S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - /* overwriting existing file/dir */ err = omfs_remove(new_dir, new_dentry); if (err) diff --git a/trunk/fs/proc/Makefile b/trunk/fs/proc/Makefile index c1c729335924..df434c5f28fb 100644 --- a/trunk/fs/proc/Makefile +++ b/trunk/fs/proc/Makefile @@ -20,7 +20,6 @@ proc-y += stat.o proc-y += uptime.o proc-y += version.o proc-y += softirqs.o -proc-y += namespaces.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index dc8bca72b002..dfa532730e55 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct dentry *dentry, struct iattr *attr) +static int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = dentry->d_inode; @@ 
-1736,7 +1736,8 @@ static int task_dumpable(struct task_struct *task) return 0; } -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) + +static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; @@ -1778,7 +1779,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t return NULL; } -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; struct task_struct *task; @@ -1819,7 +1820,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * made this apply to all per process world readable and executable * directories. */ -int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode; struct task_struct *task; @@ -1861,7 +1862,7 @@ static int pid_delete_dentry(const struct dentry * dentry) return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } -const struct dentry_operations pid_dentry_operations = +static const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, @@ -1869,6 +1870,9 @@ const struct dentry_operations pid_dentry_operations = /* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); + /* * Fill a directory entry. * @@ -1881,8 +1885,8 @@ const struct dentry_operations pid_dentry_operations = * reported by readdir in sync with the inode numbers reported * by stat. */ -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, +static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = filp->f_path.dentry; @@ -2816,7 +2820,6 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), - DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif @@ -3165,7 +3168,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), - DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), diff --git a/trunk/fs/proc/inode.c b/trunk/fs/proc/inode.c index 74b48cfa1bb2..d15aa1b1cc8f 100644 --- a/trunk/fs/proc/inode.c +++ b/trunk/fs/proc/inode.c @@ -28,7 +28,6 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; - const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -45,10 +44,6 @@ static void proc_evict_inode(struct inode 
*inode) rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } - /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); } static struct kmem_cache * proc_inode_cachep; @@ -67,8 +62,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns = NULL; - ei->ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 7838e5cfec14..3763b436e69d 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -127,21 +127,3 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); */ int proc_readdir(struct file *, void *, filldir_t); struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); - - - -/* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, - instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, struct nameidata *nd); -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); -extern const struct dentry_operations pid_dentry_operations; -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int proc_setattr(struct dentry *dentry, struct iattr *attr); - -extern const struct inode_operations proc_ns_dir_inode_operations; -extern const struct file_operations proc_ns_dir_operations; - diff --git a/trunk/fs/proc/namespaces.c b/trunk/fs/proc/namespaces.c deleted file mode 100644 index 781dec5bd682..000000000000 --- a/trunk/fs/proc/namespaces.c +++ /dev/null @@ -1,198 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "internal.h" - - -static const struct proc_ns_operations *ns_entries[] = { -#ifdef CONFIG_NET_NS - &netns_operations, -#endif -#ifdef CONFIG_UTS_NS - &utsns_operations, -#endif -#ifdef CONFIG_IPC_NS - &ipcns_operations, -#endif -}; - -static const struct file_operations ns_file_operations = { - .llseek = no_llseek, -}; - -static struct dentry *proc_ns_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct proc_ns_operations *ns_ops = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); - - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns_ops->get(task); - if (!ei->ns) - goto out_iput; - - dentry->d_op = &pid_dentry_operations; - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static int proc_ns_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, - const struct proc_ns_operations *ops) -{ - return proc_fill_cache(filp, dirent, filldir, - ops->name, strlen(ops->name), - proc_ns_instantiate, task, ops); -} - -static int proc_ns_dir_readdir(struct file *filp, void *dirent, - filldir_t 
filldir) -{ - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - const struct proc_ns_operations **entry, **last; - ino_t ino; - int ret; - - ret = -ENOENT; - if (!task) - goto out_no_task; - - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= ARRAY_SIZE(ns_entries)) { - ret = 1; - goto out; - } - entry = ns_entries + i; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - while (entry <= last) { - if (proc_ns_fill_cache(filp, dirent, filldir, - task, *entry) < 0) - goto out; - filp->f_pos++; - entry++; - } - } - - ret = 1; -out: - put_task_struct(task); -out_no_task: - return ret; -} - -const struct file_operations proc_ns_dir_operations = { - .read = generic_read_dir, - .readdir = proc_ns_dir_readdir, -}; - -static struct dentry *proc_ns_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; - unsigned int len = dentry->d_name.len; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - for (entry = ns_entries; entry <= last; entry++) { - if (strlen((*entry)->name) != len) - continue; - if (!memcmp(dentry->d_name.name, (*entry)->name, len)) - break; - } - error = ERR_PTR(-ENOENT); - if (entry > last) - goto out; - - error = proc_ns_instantiate(dir, dentry, task, *entry); -out: - put_task_struct(task); -out_no_task: - return error; -} - -const struct inode_operations proc_ns_dir_inode_operations = { - .lookup = proc_ns_dir_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c index db15935fa757..2c9db29ea358 100644 --- a/trunk/fs/proc/task_mmu.c +++ b/trunk/fs/proc/task_mmu.c @@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - vm_flags_t flags = vma->vm_flags; + int flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; diff --git a/trunk/fs/reiserfs/namei.c b/trunk/fs/reiserfs/namei.c index 76c8164d5651..118662690cdf 100644 --- a/trunk/fs/reiserfs/namei.c +++ b/trunk/fs/reiserfs/namei.c @@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - dentry_unhash(dentry); - /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. 
* The quota structure is possibly deleted only on last iput => outside @@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* three balancings: (1) old name removal, (2) new name insertion and (3) maybe "save" link insertion stat data updates: (1) old directory, diff --git a/trunk/fs/reiserfs/xattr.c b/trunk/fs/reiserfs/xattr.c index 50f1abccd1cd..47d2a4498b03 100644 --- a/trunk/fs/reiserfs/xattr.c +++ b/trunk/fs/reiserfs/xattr.c @@ -105,6 +105,7 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) mutex_unlock(&dentry->d_inode->i_mutex); if (!error) d_delete(dentry); + dput(dentry); return error; } diff --git a/trunk/fs/super.c b/trunk/fs/super.c index c75593953c52..c04f7e0b7ed2 100644 --- a/trunk/fs/super.c +++ b/trunk/fs/super.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "internal.h" @@ -113,7 +112,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; } out: return s; @@ -179,7 +177,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - cleancache_flush_fs(s); fs->kill_sb(s); /* * We need to call rcu_barrier so all the delayed rcu free diff --git a/trunk/fs/sysv/namei.c b/trunk/fs/sysv/namei.c index e2cc6756f3b1..e474fbcf8bde 100644 --- a/trunk/fs/sysv/namei.c +++ b/trunk/fs/sysv/namei.c @@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) struct inode *inode = dentry->d_inode; int err = -ENOTEMPTY; - dentry_unhash(dentry); - if (sysv_empty_dir(inode)) { err = sysv_unlink(dir, dentry); if (!err) { @@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct sysv_dir_entry * old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = sysv_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/trunk/fs/ubifs/dir.c b/trunk/fs/ubifs/dir.c index c2b80943560d..ef5abd38f0bf 100644 --- a/trunk/fs/ubifs/dir.c +++ b/trunk/fs/ubifs/dir.c @@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; - dentry_unhash(dentry); - /* * Budget request settings: deletion direntry, deletion inode and * changing the parent inode. If budgeting fails, go ahead anyway @@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. 
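Note: the reiserfs, sysv, and ubifs hunks above and the udf and ufs hunks below all delete the same boilerplate from rmdir and rename. A minimal sketch of the pattern being removed follows; example_rmdir, example_empty_dir, and example_delete_entry are hypothetical stand-ins, not functions from this patch:

	/*
	 * Sketch only, assuming the common shape visible in the hunks: each
	 * filesystem unhashed the victim dentry up front, then performed the
	 * usual emptiness check and entry removal. The dentry_unhash() call
	 * is the line this series deletes; everything else stays.
	 */
	static int example_rmdir(struct inode *dir, struct dentry *dentry)
	{
		struct inode *inode = dentry->d_inode;

		dentry_unhash(dentry);			/* the call being removed */

		if (!example_empty_dir(inode))		/* hypothetical helper */
			return -ENOTEMPTY;

		return example_delete_entry(dir, dentry); /* hypothetical helper */
	}
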
diff --git a/trunk/fs/udf/namei.c b/trunk/fs/udf/namei.c index 4d76594c2a8f..f1dce848ef96 100644 --- a/trunk/fs/udf/namei.c +++ b/trunk/fs/udf/namei.c @@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - dentry_unhash(dentry); - retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) @@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) diff --git a/trunk/fs/ufs/namei.c b/trunk/fs/ufs/namei.c index 953ebdfc5bf7..29309e25417f 100644 --- a/trunk/fs/ufs/namei.c +++ b/trunk/fs/ufs/namei.c @@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err= -ENOTEMPTY; - dentry_unhash(dentry); - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); @@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/trunk/fs/xfs/linux-2.6/xfs_discard.c b/trunk/fs/xfs/linux-2.6/xfs_discard.c index 244e797dae32..d61611c88012 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_discard.c +++ b/trunk/fs/xfs/linux-2.6/xfs_discard.c @@ -191,32 +191,3 @@ xfs_ioc_trim( return -XFS_ERROR(EFAULT); return 0; } - -int -xfs_discard_extents( - struct xfs_mount *mp, - struct list_head *list) -{ - struct xfs_busy_extent *busyp; - int error = 0; - - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0); - if (error && error != EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llu,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - return error; - } - } - - return 0; -} diff --git a/trunk/fs/xfs/linux-2.6/xfs_discard.h b/trunk/fs/xfs/linux-2.6/xfs_discard.h index 344879aea646..e82b6dd3e127 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_discard.h +++ b/trunk/fs/xfs/linux-2.6/xfs_discard.h @@ -2,9 +2,7 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); -extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); #endif /* XFS_DISCARD_H */ diff --git a/trunk/fs/xfs/linux-2.6/xfs_super.c b/trunk/fs/xfs/linux-2.6/xfs_super.c index 98b9c91fcdf1..b0aa59e51fd0 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_super.c +++ b/trunk/fs/xfs/linux-2.6/xfs_super.c @@ -110,10 +110,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ -#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ -#define MNTOPT_NODISCARD 
"nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -357,10 +355,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; - } else if (!strcmp(this_char, MNTOPT_DISCARD)) { - mp->m_flags |= XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { - mp->m_flags &= ~XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, "ihashsize")) { xfs_warn(mp, "ihashsize no longer used, option is deprecated."); @@ -394,13 +388,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DISCARD) && - !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { - xfs_warn(mp, - "the discard option is incompatible with the nodelaylog option"); - return EINVAL; - } - #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -501,7 +488,6 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, - { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { diff --git a/trunk/fs/xfs/xfs_ag.h b/trunk/fs/xfs/xfs_ag.h index 6530769a999b..da0a561ffba2 100644 --- a/trunk/fs/xfs/xfs_ag.h +++ b/trunk/fs/xfs/xfs_ag.h @@ -187,9 +187,6 @@ struct xfs_busy_extent { xfs_agnumber_t agno; xfs_agblock_t bno; xfs_extlen_t length; - unsigned int flags; -#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ -#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; /* diff --git a/trunk/fs/xfs/xfs_alloc.c b/trunk/fs/xfs/xfs_alloc.c index 95862bbff56b..acdced86413c 100644 --- a/trunk/fs/xfs/xfs_alloc.c +++ b/trunk/fs/xfs/xfs_alloc.c @@ -2469,7 +2469,7 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) - xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); + xfs_alloc_busy_insert(tp, args.agno, args.agbno, len); error0: xfs_perag_put(args.pag); return error; @@ -2480,8 +2480,7 @@ xfs_alloc_busy_insert( struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, - xfs_extlen_t len, - unsigned int flags) + xfs_extlen_t len) { struct xfs_busy_extent *new; struct xfs_busy_extent *busyp; @@ -2505,7 +2504,6 @@ xfs_alloc_busy_insert( new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); - new->flags = flags; /* trace before insert to be able to see failed inserts */ trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); @@ -2610,18 +2608,6 @@ xfs_alloc_busy_update_extent( xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; - /* - * This extent is currently being discarded. Give the thread - * performing the discard a chance to mark the extent unbusy - * and retry. - */ - if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); - delay(1); - spin_lock(&pag->pagb_lock); - return false; - } - /* * If there is a busy extent overlapping a user allocation, we have * no choice but to force the log and retry the search. @@ -2827,8 +2813,7 @@ xfs_alloc_busy_trim( * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. 
*/ - if (!args->userdata && - !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { + if (!args->userdata) { if (!xfs_alloc_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, false)) @@ -2994,16 +2979,10 @@ xfs_alloc_busy_clear_one( kmem_free(busyp); } -/* - * Remove all extents on the passed in list from the busy extents tree. - * If do_discard is set skip extents that need to be discarded, and mark - * these as undergoing a discard operation instead. - */ void xfs_alloc_busy_clear( struct xfs_mount *mp, - struct list_head *list, - bool do_discard) + struct list_head *list) { struct xfs_busy_extent *busyp, *n; struct xfs_perag *pag = NULL; @@ -3020,11 +2999,7 @@ xfs_alloc_busy_clear( agno = busyp->agno; } - if (do_discard && busyp->length && - !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) - busyp->flags = XFS_ALLOC_BUSY_DISCARDED; - else - xfs_alloc_busy_clear_one(mp, pag, busyp); + xfs_alloc_busy_clear_one(mp, pag, busyp); } if (pag) { diff --git a/trunk/fs/xfs/xfs_alloc.h b/trunk/fs/xfs/xfs_alloc.h index 2f52b924be79..240ad288f2f9 100644 --- a/trunk/fs/xfs/xfs_alloc.h +++ b/trunk/fs/xfs/xfs_alloc.h @@ -137,11 +137,10 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list); int xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/trunk/fs/xfs/xfs_alloc_btree.c b/trunk/fs/xfs/xfs_alloc_btree.c index 2b3518826a69..8b469d53599f 100644 --- a/trunk/fs/xfs/xfs_alloc_btree.c +++ b/trunk/fs/xfs/xfs_alloc_btree.c @@ -120,8 +120,7 @@ xfs_allocbt_free_block( if (error) return error; - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, - XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/trunk/fs/xfs/xfs_bmap.c b/trunk/fs/xfs/xfs_bmap.c index e546a33214c9..fa00788de2f5 100644 --- a/trunk/fs/xfs/xfs_bmap.c +++ b/trunk/fs/xfs/xfs_bmap.c @@ -88,6 +88,22 @@ xfs_bmap_add_attrfork_local( xfs_bmap_free_t *flist, /* blocks to free at commit */ int *flags); /* inode logging flags */ +/* + * Called by xfs_bmapi to update file extent records and the btree + * after allocating space (or doing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + /* * Called by xfs_bmap_add_extent to handle cases converting a delayed * allocation to a real allocation. 
@@ -95,13 +111,14 @@ xfs_bmap_add_attrfork_local( STATIC int /* error */ xfs_bmap_add_extent_delay_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ + int *logflagsp, /* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ /* * Called by xfs_bmap_add_extent to handle cases converting a hole @@ -110,9 +127,10 @@ xfs_bmap_add_extent_delay_real( STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ + int *logflagsp,/* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ /* * Called by xfs_bmap_add_extent to handle cases converting a hole @@ -121,7 +139,7 @@ xfs_bmap_add_extent_hole_delay( STATIC int /* error */ xfs_bmap_add_extent_hole_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ @@ -134,7 +152,7 @@ xfs_bmap_add_extent_hole_real( STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp); /* inode logging flags */ @@ -161,6 +179,22 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork); /* data or attr fork */ +/* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current trans pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp,/* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + /* * Remove the entry "free" from the free item list. Prev points to the * previous entry, unless "free" is the head of the list. 
@@ -440,13 +474,14 @@ xfs_bmap_add_attrfork_local( STATIC int /* error */ xfs_bmap_add_extent( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to use reserved data blocks */ { xfs_btree_cur_t *cur; /* btree cursor or null */ xfs_filblks_t da_new; /* new count del alloc blocks used */ @@ -457,27 +492,23 @@ xfs_bmap_add_extent( xfs_extnum_t nextents; /* number of extents in file now */ XFS_STATS_INC(xs_add_exlist); - cur = *curp; ifp = XFS_IFORK_PTR(ip, whichfork); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(idx <= nextents); da_old = da_new = 0; error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - /* * This is the first extent added to a new/empty file. * Special case this one, so other routines get to assume there are * already extents in the list. */ if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, + xfs_iext_insert(ip, 0, 1, new, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); ASSERT(cur == NULL); - + ifp->if_lastex = 0; if (!isnullstartblock(new->br_startblock)) { XFS_IFORK_NEXT_SET(ip, whichfork, 1); logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -491,25 +522,27 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); + if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, + &logflags, rsvd))) + goto done; } /* * Real allocation off the end of the file. */ - else if (*idx == nextents) { + else if (idx == nextents) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork))) + goto done; } else { xfs_bmbt_irec_t prev; /* old extent at offset idx */ /* * Get the record referred to by idx. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); /* * If it's a real allocation record, and the new allocation ends * after the start of the referred to record, then we're filling @@ -524,18 +557,22 @@ xfs_bmap_add_extent( if (cur) ASSERT(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(ip, - idx, &cur, new, &da_new, - first, flist, &logflags); + if ((error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, first, flist, + &logflags, rsvd))) + goto done; + } else if (new->br_state == XFS_EXT_NORM) { + ASSERT(new->br_state == XFS_EXT_NORM); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) + goto done; } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) + ASSERT(new->br_state == XFS_EXT_UNWRITTEN); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) goto done; } + ASSERT(*curp == cur || *curp == NULL); } /* * Otherwise we're filling in a hole with an allocation. 
@@ -544,15 +581,13 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork))) + goto done; } } - if (error) - goto done; ASSERT(*curp == cur || *curp == NULL); - /* * Convert to a btree if necessary. */ @@ -580,7 +615,7 @@ xfs_bmap_add_extent( ASSERT(nblks <= da_old); if (nblks < da_old) xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); + (int64_t)(da_old - nblks), rsvd); } /* * Clear out the allocated field, done with it now in any case. @@ -605,13 +640,14 @@ xfs_bmap_add_extent( STATIC int /* error */ xfs_bmap_add_extent_delay_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to use reserved data block allocation */ { xfs_btree_cur_t *cur; /* btree cursor */ int diff; /* temp value */ @@ -637,7 +673,7 @@ xfs_bmap_add_extent_delay_real( */ cur = *curp; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); @@ -656,9 +692,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -676,9 +712,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -709,14 +745,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. 
*/ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); + xfs_iext_remove(ip, idx, 2, state); + ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -748,14 +784,13 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, idx, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -779,13 +814,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_df.if_lastex = idx; + xfs_iext_remove(ip, idx + 1, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -801,7 +837,6 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, PREV.br_state))) goto done; } - *dnew = 0; break; @@ -811,10 +846,11 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -830,7 +866,6 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - *dnew = 0; break; @@ -839,16 +874,17 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. 
*/ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); + ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -868,9 +904,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - --*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; break; @@ -879,11 +913,12 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -911,10 +946,9 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); + ep = xfs_iext_get_ext(ifp, idx + 1); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); *dnew = temp; break; @@ -924,13 +958,15 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -947,14 +983,10 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_state))) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; break; @@ -964,9 +996,10 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. 
*/ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); + xfs_iext_insert(ip, idx + 1, 1, new, state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -994,11 +1027,9 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; break; @@ -1025,7 +1056,7 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; @@ -1034,7 +1065,8 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); + xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1065,7 +1097,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { + -((int64_t)diff), rsvd)) { /* * Ick gross gag me with a spoon. 
*/ @@ -1077,7 +1109,7 @@ xfs_bmap_add_extent_delay_real( if (!diff || !xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) + -((int64_t)diff), rsvd)) break; } if (temp2) { @@ -1086,20 +1118,18 @@ xfs_bmap_add_extent_delay_real( if (!diff || !xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) + -((int64_t)diff), rsvd)) break; } } } - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); - - ++*idx; + trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); *dnew = temp + temp2; break; @@ -1131,7 +1161,7 @@ xfs_bmap_add_extent_delay_real( STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp) /* inode logging flags */ @@ -1158,7 +1188,7 @@ xfs_bmap_add_extent_unwritten_real( error = 0; cur = *curp; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; oldext = (newext == XFS_EXT_UNWRITTEN) ? @@ -1181,9 +1211,9 @@ xfs_bmap_add_extent_unwritten_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -1201,9 +1231,9 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1232,15 +1262,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. 
*/ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); + xfs_iext_remove(ip, idx, 2, state); + ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1276,14 +1305,13 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, idx, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1313,12 +1341,13 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; + xfs_iext_remove(ip, idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1349,10 +1378,11 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1374,22 +1404,21 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. 
*/ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - --*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1420,16 +1449,17 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1458,19 +1488,17 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1500,14 +1528,13 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; - xfs_iext_insert(ip, *idx, 1, new, state); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_iext_insert(ip, idx + 1, 1, new, state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1541,10 +1568,10 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. 
* One extent becomes three extents. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); r[0] = *new; r[1].br_startoff = new_endoff; @@ -1552,10 +1579,8 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - - ++*idx; - xfs_iext_insert(ip, *idx, 2, &r[0], state); - + xfs_iext_insert(ip, idx + 1, 2, &r[0], state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents += 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1625,10 +1650,12 @@ xfs_bmap_add_extent_unwritten_real( STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to allocate reserved blocks */ { + xfs_bmbt_rec_host_t *ep; /* extent record for idx */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ @@ -1638,15 +1665,16 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t temp=0; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ep = xfs_iext_get_ext(ifp, idx); state = 0; ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -1656,9 +1684,9 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(ep, &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -1691,21 +1719,21 @@ xfs_bmap_add_extent_hole_delay( * on the left and on the right. * Merge all three into a single extent record. 
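The conversion case above turns one extent record into three when only the middle of a previous oldext extent changes state. A compact sketch of the interval arithmetic, assuming a simplified record of just (start, count) instead of the full xfs_bmbt_irec_t:

#include <stdio.h>

struct sk_ext { unsigned long start, count; };

/* Carve [mid.start, mid.start + mid.count) out of prev; prev fully covers mid. */
static void split_middle(struct sk_ext prev, struct sk_ext mid,
                         struct sk_ext out[3])
{
    out[0].start = prev.start;                        /* untouched head */
    out[0].count = mid.start - prev.start;
    out[1] = mid;                                     /* converted middle */
    out[2].start = mid.start + mid.count;             /* untouched tail */
    out[2].count = prev.start + prev.count - out[2].start;
}

int main(void)
{
    struct sk_ext out[3];

    split_middle((struct sk_ext){ 0, 10 }, (struct sk_ext){ 3, 4 }, out);
    for (int i = 0; i < 3; i++)
        printf("[%lu, +%lu)\n", out[i].start, out[i].count);  /* [0,+3) [3,+4) [7,+3) */
    return 0;
}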
*/ - --*idx; temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, idx, 1, state); + ip->i_df.if_lastex = idx - 1; break; case BMAP_LEFT_CONTIG: @@ -1714,17 +1742,17 @@ xfs_bmap_add_extent_hole_delay( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; temp = left.br_blockcount + new->br_blockcount; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + ip->i_df.if_lastex = idx - 1; break; case BMAP_RIGHT_CONTIG: @@ -1733,15 +1761,16 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, + xfs_bmbt_set_allf(ep, new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + + ip->i_df.if_lastex = idx; break; case 0: @@ -1751,13 +1780,14 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ip->i_df.if_lastex = idx; break; } if (oldlen != newlen) { ASSERT(oldlen > newlen); xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), 0); + (int64_t)(oldlen - newlen), rsvd); /* * Nothing to do for disk quota accounting here. */ @@ -1773,12 +1803,13 @@ xfs_bmap_add_extent_hole_delay( STATIC int /* error */ xfs_bmap_add_extent_hole_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { + xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. 
point */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1788,7 +1819,8 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ep = xfs_iext_get_ext(ifp, idx); state = 0; if (whichfork == XFS_ATTR_FORK) @@ -1797,9 +1829,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1808,9 +1840,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(ep, &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1847,15 +1879,14 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - xfs_iext_remove(ip, *idx + 1, 1, state); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + xfs_iext_remove(ip, idx, 1, state); + ifp->if_lastex = idx - 1; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { @@ -1890,12 +1921,12 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + ifp->if_lastex = idx - 1; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { @@ -1921,13 +1952,13 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, new->br_startblock, + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ifp->if_lastex = idx; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { @@ -1953,7 +1984,8 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. 
*/ - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ifp->if_lastex = idx; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { @@ -2801,12 +2833,13 @@ STATIC int /* error */ xfs_bmap_del_extent( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_extnum_t idx, /* extent number to update/delete */ xfs_bmap_free_t *flist, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to allocate reserved blocks */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -2837,10 +2870,10 @@ xfs_bmap_del_extent( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + ASSERT((idx >= 0) && (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; @@ -2914,12 +2947,11 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, *idx, 1, + xfs_iext_remove(ip, idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; + ifp->if_lastex = idx; if (delay) break; - XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; @@ -2936,20 +2968,21 @@ xfs_bmap_del_extent( /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -2965,17 +2998,18 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); da_new = temp; break; } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -2992,7 +3026,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. 
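xfs_bmap_del_extent() distinguishes four cases by comparing the deletion range against the extent that contains it. A self-contained sketch of that classification, using plain offsets in place of the kernel's br_startoff/br_blockcount fields:

#include <stdio.h>

/* Classify a deletion range [ds, de) against the containing extent [gs, ge). */
static const char *del_case(unsigned long gs, unsigned long ge,
                            unsigned long ds, unsigned long de)
{
    if (ds == gs && de == ge)
        return "whole extent: remove the record";
    if (ds == gs)
        return "front: advance startoff, shrink blockcount";
    if (de == ge)
        return "back: shrink blockcount only";
    return "middle: shrink in place, insert a record for the tail";
}

int main(void)
{
    /* the extent covers [10, 20) */
    printf("%s\n", del_case(10, 20, 10, 20));
    printf("%s\n", del_case(10, 20, 10, 14));
    printf("%s\n", del_case(10, 20, 16, 20));
    printf("%s\n", del_case(10, 20, 13, 17));
    return 0;
}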
*/ temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3079,9 +3113,9 @@ xfs_bmap_del_extent( } } } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_iext_insert(ip, idx + 1, 1, &new, state); + ifp->if_lastex = idx + 1; break; } /* @@ -3108,7 +3142,7 @@ xfs_bmap_del_extent( ASSERT(da_old >= da_new); if (da_old > da_new) { xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); + (int64_t)(da_old - da_new), rsvd); } done: *logflagsp = flags; @@ -4528,24 +4562,29 @@ xfs_bmapi( if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); + -((int64_t)extsz), (flags & + XFS_BMAPI_RSVBLOCKS)); } else { error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); + -((int64_t)alen), (flags & + XFS_BMAPI_RSVBLOCKS)); } if (!error) { error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); + -((int64_t)indlen), (flags & + XFS_BMAPI_RSVBLOCKS)); if (error && rt) xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); + (int64_t)extsz, (flags & + XFS_BMAPI_RSVBLOCKS)); else if (error) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); + (int64_t)alen, (flags & + XFS_BMAPI_RSVBLOCKS)); } if (error) { @@ -4662,12 +4701,13 @@ xfs_bmapi( if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) got.br_state = XFS_EXT_UNWRITTEN; } - error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, + error = xfs_bmap_add_extent(ip, lastx, &cur, &got, firstblock, flist, &tmp_logflags, - whichfork); + whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) goto error0; + lastx = ifp->if_lastex; ep = xfs_iext_get_ext(ifp, lastx); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); @@ -4763,12 +4803,13 @@ xfs_bmapi( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, + error = xfs_bmap_add_extent(ip, lastx, &cur, mval, firstblock, flist, &tmp_logflags, - whichfork); + whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) goto error0; + lastx = ifp->if_lastex; ep = xfs_iext_get_ext(ifp, lastx); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); @@ -4827,14 +4868,14 @@ xfs_bmapi( /* * Else go on to the next record. */ + ep = xfs_iext_get_ext(ifp, ++lastx); prev = got; - if (++lastx < nextents) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + if (lastx >= nextents) eof = 1; - } + else + xfs_bmbt_get_all(ep, &got); } + ifp->if_lastex = lastx; *nmap = n; /* * Transform from btree to extents, give it cur. 
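The xfs_bmapi() hunk above reserves space in two steps (the extent or data-block count, then the worst-case indirect blocks) and rolls the first step back if the second fails. A toy model of that reserve-then-unwind pattern; the two counters here are simplified stand-ins, and in XFS the non-realtime case draws both reservations from the same free-block counter:

#include <stdio.h>

static long free_blocks = 100;   /* stand-in for the free-block counter */
static long free_ind    = 100;   /* split out here only for illustration */

static int reserve(long alen, long indlen)
{
    if (free_blocks < alen)
        return -1;                       /* ENOSPC */
    free_blocks -= alen;                 /* step 1: the data blocks */

    if (free_ind < indlen) {             /* step 2 failed... */
        free_blocks += alen;             /* ...undo step 1 before erroring out */
        return -1;
    }
    free_ind -= indlen;                  /* step 2: worst-case indirect blocks */
    return 0;
}

int main(void)
{
    printf("reserve(40, 5)   -> %d (free=%ld)\n", reserve(40, 5), free_blocks);
    printf("reserve(50, 500) -> %d (free=%ld)\n", reserve(50, 500), free_blocks);
    return 0;
}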
@@ -4943,6 +4984,7 @@ xfs_bmapi_single( ASSERT(!isnullstartblock(got.br_startblock)); ASSERT(bno < got.br_startoff + got.br_blockcount); *fsb = got.br_startblock + (bno - got.br_startoff); + ifp->if_lastex = lastx; return 0; } @@ -4984,6 +5026,7 @@ xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ + int rsvd; /* OK to allocate reserved blocks */ xfs_fsblock_t sum; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5001,7 +5044,7 @@ xfs_bunmapi( mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - + rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; ASSERT(len > 0); ASSERT(nexts >= 0); ASSERT(ifp->if_ext_max == @@ -5117,9 +5160,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, + error = xfs_bmap_add_extent(ip, lastx, &cur, &del, firstblock, flist, &logflags, - XFS_DATA_FORK); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5145,12 +5188,9 @@ xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(--ep, &got); } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { @@ -5174,19 +5214,18 @@ xfs_bunmapi( prev.br_startoff = start; } prev.br_state = XFS_EXT_UNWRITTEN; - lastx--; - error = xfs_bmap_add_extent(ip, &lastx, &cur, + error = xfs_bmap_add_extent(ip, lastx - 1, &cur, &prev, firstblock, flist, &logflags, - XFS_DATA_FORK); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, + error = xfs_bmap_add_extent(ip, lastx, &cur, &del, firstblock, flist, &logflags, - XFS_DATA_FORK); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5201,13 +5240,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, 0); + (int64_t)rtexts, rsvd); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, 0); + (int64_t)del.br_blockcount, rsvd); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5238,29 +5277,31 @@ xfs_bunmapi( error = XFS_ERROR(ENOSPC); goto error0; } - error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, - &tmp_logflags, whichfork); + error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, + &tmp_logflags, whichfork, rsvd); logflags |= tmp_logflags; if (error) goto error0; bno = del.br_startoff - 1; nodelete: + lastx = ifp->if_lastex; /* * If not done go on to the next (previous) record. + * Reset ep in case the extents array was re-alloced. 
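The comment above ("Reset ep in case the extents array was re-alloced") is the crux of the if_lastex scheme this patch restores: helpers record the last index they touched, and callers re-derive pointers from that index after any operation that may have reallocated the array. A small sketch of the idiom, with error handling elided:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct extlist {
    int *items;
    int  nr;
    int  lastex;     /* index of the last record touched, mirrors if_lastex */
};

static int *get_ext(struct extlist *l, int idx)
{
    l->lastex = idx;
    return &l->items[idx];
}

/* Insertion may realloc the array, invalidating every cached pointer.
 * (realloc failure is ignored in this sketch) */
static void insert_ext(struct extlist *l, int idx, int val)
{
    l->items = realloc(l->items, (l->nr + 1) * sizeof(*l->items));
    memmove(&l->items[idx + 1], &l->items[idx],
            (l->nr - idx) * sizeof(*l->items));
    l->items[idx] = val;
    l->nr++;
    l->lastex = idx;
}

int main(void)
{
    struct extlist l = { malloc(2 * sizeof(int)), 2, 0 };
    l.items[0] = 10;
    l.items[1] = 30;

    int *ep = get_ext(&l, 1);       /* cached pointer... */
    insert_ext(&l, 1, 20);          /* ...may now dangle after the realloc */
    ep = get_ext(&l, l.lastex);     /* so re-derive it from lastex */
    printf("%d\n", *ep);            /* prints 20 */
    free(l.items);
    return 0;
}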
*/ + ep = xfs_iext_get_ext(ifp, lastx); if (bno != (xfs_fileoff_t)-1 && bno >= start) { - if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || + xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, lastx); } + if (lastx >= 0) + xfs_bmbt_get_all(ep, &got); extno++; } } + ifp->if_lastex = lastx; *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); diff --git a/trunk/fs/xfs/xfs_bmap.h b/trunk/fs/xfs/xfs_bmap.h index c62234bde053..3651191daea1 100644 --- a/trunk/fs/xfs/xfs_bmap.h +++ b/trunk/fs/xfs/xfs_bmap.h @@ -69,6 +69,7 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ +#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */ #define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ /* combine contig. space */ @@ -86,6 +87,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ diff --git a/trunk/fs/xfs/xfs_inode.c b/trunk/fs/xfs/xfs_inode.c index a098a20ca63e..c8e3349c287c 100644 --- a/trunk/fs/xfs/xfs_inode.c +++ b/trunk/fs/xfs/xfs_inode.c @@ -920,6 +920,7 @@ xfs_iread_extents( /* * We know that the size is valid (it's checked in iformat_btree) */ + ifp->if_lastex = NULLEXTNUM; ifp->if_bytes = ifp->if_real_bytes = 0; ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); @@ -2557,9 +2558,12 @@ xfs_iflush_fork( case XFS_DINODE_FMT_EXTENTS: ASSERT((ifp->if_flags & XFS_IFEXTENTS) || !(iip->ili_format.ilf_fields & extflag[whichfork])); + ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || + (ifp->if_bytes == 0)); + ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || + (ifp->if_bytes > 0)); if ((iip->ili_format.ilf_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); @@ -3108,8 +3112,6 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; } else if (ifp->if_flags & XFS_IFEXTIREC) { @@ -3189,6 +3191,7 @@ xfs_iext_add( } ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; ifp->if_real_bytes = 0; + ifp->if_lastex = nextents + ext_diff; } /* * Otherwise use a linear (direct) extent list. 
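xfs_bmap.h pairs each XFS_BMAPI_* bit with a string in a { flag, "NAME" } table so trace output can print flag words symbolically. A minimal sketch of the same table-driven decoding; the BM_* names are illustrative, though the three values shown match the defines above:

#include <stdio.h>

#define BM_ENTIRE    0x004
#define BM_METADATA  0x008
#define BM_RSVBLOCKS 0x020

static const struct { unsigned bit; const char *name; } bm_names[] = {
    { BM_ENTIRE,    "ENTIRE" },
    { BM_METADATA,  "METADATA" },
    { BM_RSVBLOCKS, "RSVBLOCKS" },
};

static void bm_print(unsigned flags)
{
    for (unsigned i = 0; i < sizeof(bm_names) / sizeof(bm_names[0]); i++)
        if (flags & bm_names[i].bit)
            printf("%s ", bm_names[i].name);
    putchar('\n');
}

int main(void)
{
    bm_print(BM_METADATA | BM_RSVBLOCKS);   /* prints: METADATA RSVBLOCKS */
    return 0;
}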
@@ -3883,10 +3886,8 @@ xfs_iext_idx_to_irec( xfs_extnum_t page_idx = *idxp; /* extent index in target list */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); - + ASSERT(page_idx >= 0 && page_idx <= + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; low = 0; diff --git a/trunk/fs/xfs/xfs_inode.h b/trunk/fs/xfs/xfs_inode.h index 3ae6d58e5473..ff4e2a30227d 100644 --- a/trunk/fs/xfs/xfs_inode.h +++ b/trunk/fs/xfs/xfs_inode.h @@ -67,6 +67,7 @@ typedef struct xfs_ifork { short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ unsigned char if_ext_max; /* max # of extent records */ + xfs_extnum_t if_lastex; /* last if_extents used */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ diff --git a/trunk/fs/xfs/xfs_log_cil.c b/trunk/fs/xfs/xfs_log_cil.c index c7755d5a5fbe..7d56e88a3f0e 100644 --- a/trunk/fs/xfs/xfs_log_cil.c +++ b/trunk/fs/xfs/xfs_log_cil.c @@ -29,7 +29,6 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" -#include "xfs_discard.h" /* * Perform initial CIL structure initialisation. If the CIL is not @@ -362,28 +361,18 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); xfs_alloc_busy_sort(&ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, - (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents); spin_lock(&ctx->cil->xc_cil_lock); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_cil_lock); xlog_cil_free_logvec(ctx->lv_chain); - - if (!list_empty(&ctx->busy_extents)) { - ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); - - xfs_discard_extents(mp, &ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); - } - kmem_free(ctx); } diff --git a/trunk/fs/xfs/xfs_mount.h b/trunk/fs/xfs/xfs_mount.h index 3d68bb267c5f..19af0ab0d0c6 100644 --- a/trunk/fs/xfs/xfs_mount.h +++ b/trunk/fs/xfs/xfs_mount.h @@ -224,7 +224,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for disk errors in metadata */ -#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ #define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment diff --git a/trunk/fs/xfs/xfs_trans.c b/trunk/fs/xfs/xfs_trans.c index 7c7bc2b786bd..d1f24858ccc4 100644 --- a/trunk/fs/xfs/xfs_trans.c +++ b/trunk/fs/xfs/xfs_trans.c @@ -609,7 +609,7 @@ xfs_trans_free( struct xfs_trans *tp) { xfs_alloc_busy_sort(&tp->t_busy); - xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/trunk/include/linux/buffer_head.h b/trunk/include/linux/buffer_head.h index 503c8a6b3079..f5df23561b96 100644 --- a/trunk/include/linux/buffer_head.h +++ b/trunk/include/linux/buffer_head.h @@ -217,24 +217,8 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page 
*page, unsigned from, unsigned to); -int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); -/* Convert errno to return value from ->page_mkwrite() call */ -static inline int block_page_mkwrite_return(int err) -{ - if (err == 0) - return VM_FAULT_LOCKED; - if (err == -EFAULT) - return VM_FAULT_NOPAGE; - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err == -EAGAIN) - return VM_FAULT_RETRY; - /* -ENOSPC, -EDQUOT, -EIO ... */ - return VM_FAULT_SIGBUS; -} sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, diff --git a/trunk/include/linux/cleancache.h b/trunk/include/linux/cleancache.h deleted file mode 100644 index 04ffb2e6c9d0..000000000000 --- a/trunk/include/linux/cleancache.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(char *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*flush_page)(int, struct cleancache_filekey, pgoff_t); - void (*flush_inode)(int, struct cleancache_filekey); - void (*flush_fs)(int); -}; - -extern struct cleancache_ops - cleancache_register_ops(struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_flush_page(struct address_space *, struct page *); -extern void __cleancache_flush_inode(struct address_space *); -extern void __cleancache_flush_fs(struct super_block *); -extern int cleancache_enabled; - -#ifdef CONFIG_CLEANCACHE -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. 
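The deleted cleancache.h centers on two pieces: an ops structure filled in by a backend at registration time (the old ops are returned so nesting can be detected) and a cheap global enable flag tested by every inline shim. A self-contained toy version of that pattern, with cc_* names standing in for the cleancache_* ones:

#include <stdio.h>

struct cc_ops {
    int  (*get)(int key, int *val);
    void (*put)(int key, int val);
};

static int cc_enabled;              /* cheap global gate, read on every call */
static struct cc_ops cc_ops;        /* active backend, set at registration */

/* Returns the previously registered ops so a caller can detect nesting. */
static struct cc_ops cc_register(struct cc_ops *ops)
{
    struct cc_ops old = cc_ops;
    cc_ops = *ops;
    cc_enabled = 1;
    return old;
}

static int cc_get(int key, int *val)
{
    if (!cc_enabled)                /* no backend: one flag test and done */
        return -1;
    return cc_ops.get(key, val);
}

/* toy backend */
static int store[16];
static int  toy_get(int key, int *val) { *val = store[key & 15]; return 0; }
static void toy_put(int key, int val)  { store[key & 15] = val; }

int main(void)
{
    struct cc_ops toy = { toy_get, toy_put };
    int v;

    if (cc_get(3, &v) < 0)
        puts("disabled: the call collapses to a single flag test");

    cc_register(&toy);
    cc_ops.put(3, 42);
    if (cc_get(3, &v) == 0)
        printf("hit: %d\n", v);
    return 0;
}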
- * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. - */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - int ret = -1; - - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_flush_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_page(mapping, page); -} - -static inline void cleancache_flush_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_inode(mapping); -} - -static inline void cleancache_flush_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_flush_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 241609346dfb..3f9d3251790d 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -1428,11 +1428,6 @@ struct super_block { */ char __rcu *s_options; const struct dentry_operations *s_d_op; /* default d_op for dentries */ - - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h index 59225ef27d15..943c76b3d4bb 100644 --- a/trunk/include/linux/hugetlb.h +++ b/trunk/include/linux/hugetlb.h @@ -1,7 +1,6 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H -#include #include #include @@ -42,7 +41,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - vm_flags_t vm_flags); + int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); void copy_huge_page(struct page *dst, struct page *src); @@ -169,7 +168,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, +struct file *hugetlb_file_setup(const char *name, size_t size, int acct, struct user_struct **user, int creat_flags); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); @@ -193,7 +192,7 @@ static inline void set_file_hugepages(struct file *file) #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() static inline struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct user_struct **user, int creat_flags) + int acctflag, struct user_struct **user, int creat_flags) { return ERR_PTR(-ENOSYS); } diff --git 
a/trunk/include/linux/hugetlb_inline.h b/trunk/include/linux/hugetlb_inline.h index 2bb681fbeb35..6931489a5c14 100644 --- a/trunk/include/linux/hugetlb_inline.h +++ b/trunk/include/linux/hugetlb_inline.h @@ -7,7 +7,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_HUGETLB); + return vma->vm_flags & VM_HUGETLB; } #else diff --git a/trunk/include/linux/if_link.h b/trunk/include/linux/if_link.h index 0ee969a5593d..f4a2e6b1b864 100644 --- a/trunk/include/linux/if_link.h +++ b/trunk/include/linux/if_link.h @@ -136,7 +136,6 @@ enum { IFLA_PORT_SELF, IFLA_AF_SPEC, IFLA_GROUP, /* Group the device belongs to */ - IFLA_NET_NS_FD, __IFLA_MAX }; diff --git a/trunk/include/linux/jbd2.h b/trunk/include/linux/jbd2.h index 4ecb7b16b278..a32dcaec04e1 100644 --- a/trunk/include/linux/jbd2.h +++ b/trunk/include/linux/jbd2.h @@ -529,10 +529,9 @@ struct transaction_s enum { T_RUNNING, T_LOCKED, + T_RUNDOWN, T_FLUSH, T_COMMIT, - T_COMMIT_DFLUSH, - T_COMMIT_JFLUSH, T_FINISHED } t_state; @@ -659,9 +658,7 @@ struct transaction_s * waiting for it to finish. */ unsigned int t_synchronous_commit:1; - - /* Disk flush needs to be sent to fs partition [no locking] */ - int t_need_data_flush; + unsigned int t_flushed_data_blocks:1; /* * For use by the filesystem to store fs-specific data @@ -1231,7 +1228,6 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); -int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index fb8e814f78dc..8eb969ebf904 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -165,12 +165,12 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_PFN_AT_MMAP); + return (vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_PFNMAP); + return (vma->vm_flags & VM_PFNMAP); } /* @@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff); + unsigned int vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 6fe96c19f85e..071d459e866b 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -102,8 +102,6 @@ struct page { #endif }; -typedef unsigned long __nocast vm_flags_t; - /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. 
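Several hunks above revert `return !!(flags & BIT)` back to `return flags & BIT`. The `!!` form matters when a wide flag word is masked and then narrowed to int: a bit above position 31 would be truncated away, while `!!` collapses the result to 0 or 1 first. A two-line demonstration (the specific VM_* bits touched here do fit in an int, so this illustrates the general hazard rather than a bug in this revert):

#include <stdio.h>

int main(void)
{
    unsigned long flags = 1UL << 40;            /* a flag above bit 31 */

    int raw  = flags & (1UL << 40);             /* high bits lost converting to int */
    int norm = !!(flags & (1UL << 40));         /* collapsed to 0 or 1 first */

    printf("%d %d\n", raw, norm);               /* "0 1" on a typical LP64 build */
    return 0;
}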
These are held in a global tree and are pinned by the VMAs that @@ -111,7 +109,7 @@ typedef unsigned long __nocast vm_flags_t; */ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ - vm_flags_t vm_flags; /* VMA vm_flags */ + unsigned long vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index 648c9c58add7..3686cd6c9aca 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -179,8 +179,6 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); -extern struct file *proc_ns_fget(int fd); - #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -243,11 +241,6 @@ static inline void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) {} -static inline struct file *proc_ns_fget(int fd) -{ - return ERR_PTR(-EINVAL); -} - #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -259,18 +252,6 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, int type) extern void kclist_add(struct kcore_list *, void *, size_t, int type); #endif -struct nsproxy; -struct proc_ns_operations { - const char *name; - int type; - void *(*get)(struct task_struct *task); - void (*put)(void *ns); - int (*install)(struct nsproxy *nsproxy, void *ns); -}; -extern const struct proc_ns_operations netns_operations; -extern const struct proc_ns_operations utsns_operations; -extern const struct proc_ns_operations ipcns_operations; - union proc_op { int (*proc_get_link)(struct inode *, struct path *); int (*proc_read)(struct task_struct *task, char *page); @@ -289,8 +270,6 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - void *ns; - const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; diff --git a/trunk/include/linux/syscalls.h b/trunk/include/linux/syscalls.h index 8c03b98df5f9..ab71447d0c5a 100644 --- a/trunk/include/linux/syscalls.h +++ b/trunk/include/linux/syscalls.h @@ -846,5 +846,4 @@ asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); -asmlinkage long sys_setns(int fd, int nstype); #endif diff --git a/trunk/include/net/net_namespace.h b/trunk/include/net/net_namespace.h index dcc8f5749d3f..3ae491932bc8 100644 --- a/trunk/include/net/net_namespace.h +++ b/trunk/include/net/net_namespace.h @@ -119,7 +119,6 @@ static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns) extern struct list_head net_namespace_list; extern struct net *get_net_ns_by_pid(pid_t pid); -extern struct net *get_net_ns_by_fd(int pid); #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); diff --git a/trunk/include/xen/interface/xen.h b/trunk/include/xen/interface/xen.h index 70213b4515eb..b33257bc7e83 100644 --- a/trunk/include/xen/interface/xen.h +++ b/trunk/include/xen/interface/xen.h @@ -58,7 +58,6 @@ #define __HYPERVISOR_event_channel_op 32 #define __HYPERVISOR_physdev_op 33 #define __HYPERVISOR_hvm_op 34 -#define __HYPERVISOR_tmem_op 38 /* Architecture-specific hypercall definitions. 
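The proc_ns_operations table removed above gives each namespace type a name plus get/put/install hooks, which is what sys_setns() drives. A toy version of the vtable and its call sequence, with reference counting reduced to an integer; the install-failure path returning the reference is an assumption of this sketch, not a claim about the kernel code:

#include <stdio.h>

struct ns_ops {
    const char *name;
    void *(*get)(void);           /* grab a referenced namespace object */
    void  (*put)(void *ns);       /* drop that reference */
    int   (*install)(void *ns);   /* switch the caller into it */
};

static int do_setns(const struct ns_ops *ops)
{
    void *ns = ops->get();
    if (!ns)
        return -1;
    int err = ops->install(ns);
    if (err)
        ops->put(ns);             /* install failed: give the reference back */
    return err;
}

/* toy "uts" namespace */
static int uts_refs;
static void *uts_get(void)        { uts_refs++; return &uts_refs; }
static void  uts_put(void *ns)    { (void)ns; uts_refs--; }
static int   uts_install(void *ns){ (void)ns; return 0; }

static const struct ns_ops uts_ops = { "uts", uts_get, uts_put, uts_install };

int main(void)
{
    printf("setns(uts) -> %d, refs=%d\n", do_setns(&uts_ops), uts_refs);
    return 0;
}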
*/ #define __HYPERVISOR_arch_0 48 @@ -462,27 +461,6 @@ typedef uint8_t xen_domain_handle_t[16]; #define __mk_unsigned_long(x) x ## UL #define mk_unsigned_long(x) __mk_unsigned_long(x) -#define TMEM_SPEC_VERSION 1 - -struct tmem_op { - uint32_t cmd; - int32_t pool_id; - union { - struct { /* for cmd == TMEM_NEW_POOL */ - uint64_t uuid[2]; - uint32_t flags; - } new; - struct { - uint64_t oid[3]; - uint32_t index; - uint32_t tmem_offset; - uint32_t pfn_offset; - uint32_t len; - GUEST_HANDLE(void) gmfn; /* guest machine page frame */ - } gen; - } u; -}; - #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ diff --git a/trunk/ipc/namespace.c b/trunk/ipc/namespace.c index ce0a647869b1..8054c8e5faf1 100644 --- a/trunk/ipc/namespace.c +++ b/trunk/ipc/namespace.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "util.h" @@ -141,39 +140,3 @@ void put_ipc_ns(struct ipc_namespace *ns) free_ipc_ns(ns); } } - -static void *ipcns_get(struct task_struct *task) -{ - struct ipc_namespace *ns = NULL; - struct nsproxy *nsproxy; - - rcu_read_lock(); - nsproxy = task_nsproxy(task); - if (nsproxy) - ns = get_ipc_ns(nsproxy->ipc_ns); - rcu_read_unlock(); - - return ns; -} - -static void ipcns_put(void *ns) -{ - return put_ipc_ns(ns); -} - -static int ipcns_install(struct nsproxy *nsproxy, void *ns) -{ - /* Ditch state from the old ipc namespace */ - exit_sem(current); - put_ipc_ns(nsproxy->ipc_ns); - nsproxy->ipc_ns = get_ipc_ns(ns); - return 0; -} - -const struct proc_ns_operations ipcns_operations = { - .name = "ipc", - .type = CLONE_NEWIPC, - .get = ipcns_get, - .put = ipcns_put, - .install = ipcns_install, -}; diff --git a/trunk/ipc/shm.c b/trunk/ipc/shm.c index ab3385a21b27..729acb7e3148 100644 --- a/trunk/ipc/shm.c +++ b/trunk/ipc/shm.c @@ -347,7 +347,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct file * file; char name[13]; int id; - vm_flags_t acctflag = 0; + int acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; diff --git a/trunk/kernel/nsproxy.c b/trunk/kernel/nsproxy.c index 5424e37673ed..a05d191ffdd9 100644 --- a/trunk/kernel/nsproxy.c +++ b/trunk/kernel/nsproxy.c @@ -22,9 +22,6 @@ #include #include #include -#include -#include -#include static struct kmem_cache *nsproxy_cachep; @@ -236,45 +233,6 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) -{ - const struct proc_ns_operations *ops; - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; - struct proc_inode *ei; - struct file *file; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); - - err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) - goto out; - - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); - goto out; - } - - err = ops->install(new_nsproxy, ei->ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; - } - switch_task_namespaces(tsk, new_nsproxy); -out: - fput(file); - return err; -} - static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); diff --git a/trunk/kernel/utsname.c b/trunk/kernel/utsname.c index bff131b9510a..44646179eaba 100644 --- a/trunk/kernel/utsname.c +++ b/trunk/kernel/utsname.c @@ -15,7 +15,6 @@ #include #include #include -#include static struct uts_namespace 
*create_uts_ns(void) { @@ -80,41 +79,3 @@ void free_uts_ns(struct kref *kref) put_user_ns(ns->user_ns); kfree(ns); } - -static void *utsns_get(struct task_struct *task) -{ - struct uts_namespace *ns = NULL; - struct nsproxy *nsproxy; - - rcu_read_lock(); - nsproxy = task_nsproxy(task); - if (nsproxy) { - ns = nsproxy->uts_ns; - get_uts_ns(ns); - } - rcu_read_unlock(); - - return ns; -} - -static void utsns_put(void *ns) -{ - put_uts_ns(ns); -} - -static int utsns_install(struct nsproxy *nsproxy, void *ns) -{ - get_uts_ns(ns); - put_uts_ns(nsproxy->uts_ns); - nsproxy->uts_ns = ns; - return 0; -} - -const struct proc_ns_operations utsns_operations = { - .name = "uts", - .type = CLONE_NEWUTS, - .get = utsns_get, - .put = utsns_put, - .install = utsns_install, -}; - diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index 8ca47a5ee9c8..e9c0c61f2ddd 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -347,26 +347,3 @@ config NEED_PER_CPU_KM depends on !SMP bool default y - -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - default n - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. So when the PFRA "evicts" a page, it first attempts to use - cleancacne code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 836e4163c1bf..42a8326c3e3d 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -49,4 +49,3 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/trunk/mm/cleancache.c b/trunk/mm/cleancache.c deleted file mode 100644 index bcaae4c2a770..000000000000 --- a/trunk/mm/cleancache.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - * - * This work is licensed under the terms of the GNU GPL, version 2. - */ - -#include -#include -#include -#include -#include - -/* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/flush even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. 
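The removed Kconfig text describes cleancache as a page-granularity victim cache: clean pages are offered to the cache on eviction, and reads consult it before touching the disk. A toy model of that put-on-evict / get-on-read flow, with a fixed array standing in for transcendent memory:

#include <stdio.h>
#include <string.h>

#define SLOTS 8
static char cache[SLOTS][64];     /* toy "transcendent memory" */
static int  present[SLOTS];

static void cache_put(int key, const char *page)   /* on clean-page eviction */
{
    strcpy(cache[key % SLOTS], page);              /* short fixed strings only */
    present[key % SLOTS] = 1;
}

static int cache_get(int key, char *page)          /* consulted before disk I/O */
{
    if (!present[key % SLOTS])
        return -1;
    strcpy(page, cache[key % SLOTS]);
    return 0;
}

static int read_page(int key, char *page)
{
    if (cache_get(key, page) == 0)
        return 0;                                  /* hit: no disk access */
    strcpy(page, "(read from disk)");              /* miss: fall back to I/O */
    return 1;
}

int main(void)
{
    char buf[64];
    int miss;

    cache_put(5, "clean page contents");           /* PFRA evicts a clean page */
    miss = read_page(5, buf);
    printf("read(5): %s (%s)\n", buf, miss ? "miss" : "hit");
    return 0;
}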
- */ -int cleancache_enabled; -EXPORT_SYMBOL(cleancache_enabled); - -/* - * cleancache_ops is set by cleancache_ops_register to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static struct cleancache_ops cleancache_ops; - -/* useful stats available in /sys/kernel/mm/cleancache */ -static unsigned long cleancache_succ_gets; -static unsigned long cleancache_failed_gets; -static unsigned long cleancache_puts; -static unsigned long cleancache_flushes; - -/* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting - */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) -{ - struct cleancache_ops old = cleancache_ops; - - cleancache_ops = *ops; - cleancache_enabled = 1; - return old; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. - */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. - * Page must be locked by caller. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. 
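cleancache_get_key() above prefers an exportable filehandle as the per-file key and falls back to the inode number when the filesystem has no encode_fh hook. A simplified sketch of that key-derivation logic; the types and the toy encoder are illustrative, not the exportfs API:

#include <stdio.h>
#include <string.h>

#define FH_MAX 6

struct sk_key {
    unsigned long ino;
    unsigned      fh[FH_MAX];
};

/* encode_fh-style hook: returns the filehandle length, or <= 0 on failure */
typedef int (*encode_fh_t)(unsigned long ino, unsigned *fh, int maxlen);

static int make_key(unsigned long ino, encode_fh_t encode_fh,
                    struct sk_key *key)
{
    memset(key, 0, sizeof(*key));
    key->ino = ino;                      /* default: the inode number */
    if (encode_fh) {                     /* exportable fs: prefer the filehandle */
        int len = encode_fh(ino, key->fh, FH_MAX);
        if (len <= 0)
            return -1;                   /* no stable key for this file */
    }
    return 0;
}

static int toy_encode(unsigned long ino, unsigned *fh, int maxlen)
{
    if (maxlen < 2)
        return 0;
    fh[0] = (unsigned)ino;
    fh[1] = 0x1234;                      /* e.g. a generation number */
    return 2;
}

int main(void)
{
    struct sk_key k;

    make_key(42, NULL, &k);              /* simple fs: ino is the key */
    printf("ino key: %lu\n", k.ino);

    make_key(42, toy_encode, &k);        /* exportable fs: fh is the key */
    printf("fh key: %u %u\n", k.fh[0], k.fh[1]);
    return 0;
}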
- */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Flush any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - */ -void __cleancache_flush_page(struct address_space *mapping, struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0) { - VM_BUG_ON(!PageLocked(page)); - if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.flush_page)(pool_id, key, page->index); - cleancache_flushes++; - } - } -} -EXPORT_SYMBOL(__cleancache_flush_page); - -/* - * Flush all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. - */ -void __cleancache_flush_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.flush_inode)(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_flush_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs - */ -void __cleancache_flush_fs(struct super_block *sb) -{ - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.flush_fs)(old_poolid); - } -} -EXPORT_SYMBOL(__cleancache_flush_fs); - -#ifdef CONFIG_SYSFS - -/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ - -#define CLEANCACHE_SYSFS_RO(_name) \ - static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", cleancache_##_name); \ - } \ - static struct kobj_attribute cleancache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = cleancache_##_name##_show, \ - } - -CLEANCACHE_SYSFS_RO(succ_gets); -CLEANCACHE_SYSFS_RO(failed_gets); -CLEANCACHE_SYSFS_RO(puts); -CLEANCACHE_SYSFS_RO(flushes); - -static struct attribute *cleancache_attrs[] = { - &cleancache_succ_gets_attr.attr, - &cleancache_failed_gets_attr.attr, - &cleancache_puts_attr.attr, - &cleancache_flushes_attr.attr, - NULL, -}; - -static struct attribute_group cleancache_attr_group = { - .attrs = cleancache_attrs, - .name = "cleancache", -}; - -#endif /* CONFIG_SYSFS */ - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_SYSFS - int err; - - err = sysfs_create_group(mm_kobj, &cleancache_attr_group); -#endif /* CONFIG_SYSFS */ - return 0; -} -module_init(init_cleancache) diff --git a/trunk/mm/filemap.c b/trunk/mm/filemap.c index 7455ccd8bda8..68e782b3d3de 100644 --- a/trunk/mm/filemap.c +++ b/trunk/mm/filemap.c @@ -34,7 +34,6 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ -#include #include "internal.h" /* @@ -119,16 +118,6 @@ void __delete_from_page_cache(struct page *page) { 
struct address_space *mapping = page->mapping; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); - else - cleancache_flush_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/trunk/mm/fremap.c b/trunk/mm/fremap.c index b8e0e2d468af..7f4123056e06 100644 --- a/trunk/mm/fremap.c +++ b/trunk/mm/fremap.c @@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * drop PG_Mlocked flag for over-mapped range */ - vm_flags_t saved_flags = vma->vm_flags; + unsigned int saved_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); vma->vm_flags = saved_flags; } diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index f33bb319b73f..5fd68b95c671 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - vm_flags_t vm_flags) + int acctflag) { long ret, chg; struct hstate *h = hstate_inode(inode); @@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * and filesystem quota without using reserves */ - if (vm_flags & VM_NORESERVE) + if (acctflag & VM_NORESERVE) return 0; /* diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index fc24f7d788bd..b73f677f0bb1 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline int is_cow_mapping(unsigned int flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } diff --git a/trunk/mm/mlock.c b/trunk/mm/mlock.c index 048260c4e02e..516b2c2ddd5a 100644 --- a/trunk/mm/mlock.c +++ b/trunk/mm/mlock.c @@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, unsigned int newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = !!(newflags & VM_LOCKED); + int lock = newflags & VM_LOCKED; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags; + unsigned int newflags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
 		goto out;
 
 	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
-		vm_flags_t newflags;
+		unsigned int newflags;
 
 		newflags = vma->vm_flags | VM_LOCKED;
 		if (!(flags & MCL_CURRENT))
diff --git a/trunk/mm/mmap.c b/trunk/mm/mmap.c
index bbdc9af5e117..ac2631b7477f 100644
--- a/trunk/mm/mmap.c
+++ b/trunk/mm/mmap.c
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
-	vm_flags_t vm_flags;
+	unsigned int vm_flags;
 	int error;
 	unsigned long reqprot = prot;
 
@@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  */
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
-	vm_flags_t vm_flags = vma->vm_flags;
+	unsigned int vm_flags = vma->vm_flags;
 
 	/* If it was private or non-writable, the write bit is already clear */
 	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
  * We account for memory if it's a private writeable mapping,
  * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 {
 	/*
 	 * hugetlb has its own accounting separate from the core VM
@@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  vm_flags_t vm_flags, unsigned long pgoff)
+			  unsigned int vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
diff --git a/trunk/mm/slub.c b/trunk/mm/slub.c
index 7be0223531b0..4aad32d2e60d 100644
--- a/trunk/mm/slub.c
+++ b/trunk/mm/slub.c
@@ -1831,6 +1831,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	page->inuse = page->objects;
 	page->freelist = NULL;
 
+unlock_out:
 	slab_unlock(page);
 	c->tid = next_tid(c->tid);
 	local_irq_restore(flags);
diff --git a/trunk/mm/truncate.c b/trunk/mm/truncate.c
index 3a29a6180212..a95667529135 100644
--- a/trunk/mm/truncate.c
+++ b/trunk/mm/truncate.c
@@ -19,7 +19,6 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
-#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -52,7 +51,6 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	cleancache_flush_page(page->mapping, page);
 	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }
@@ -216,7 +214,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t next;
 	int i;
 
-	cleancache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -294,7 +291,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 	}
-	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -444,7 +440,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
-	cleancache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !wrapped &&
@@ -503,7 +498,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
-	cleancache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/trunk/net/core/net_namespace.c b/trunk/net/core/net_namespace.c
index 6c6b86d0da15..2e2dce6583e1 100644
--- a/trunk/net/core/net_namespace.c
+++ b/trunk/net/core/net_namespace.c
@@ -8,8 +8,6 @@
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
-#include <linux/proc_fs.h>
-#include <linux/file.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -304,28 +302,6 @@ void __put_net(struct net *net)
 }
 EXPORT_SYMBOL_GPL(__put_net);
 
-struct net *get_net_ns_by_fd(int fd)
-{
-	struct proc_inode *ei;
-	struct file *file;
-	struct net *net;
-
-	net = ERR_PTR(-EINVAL);
-	file = proc_ns_fget(fd);
-	if (!file)
-		goto out;
-
-	ei = PROC_I(file->f_dentry->d_inode);
-	if (ei->ns_ops != &netns_operations)
-		goto out;
-
-	net = get_net(ei->ns);
-out:
-	if (file)
-		fput(file);
-	return net;
-}
-
 #else
 struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 {
@@ -333,11 +309,6 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 		return ERR_PTR(-EINVAL);
 	return old_net;
 }
-
-struct net *get_net_ns_by_fd(int fd)
-{
-	return ERR_PTR(-EINVAL);
-}
 #endif
 
 struct net *get_net_ns_by_pid(pid_t pid)
@@ -590,39 +561,3 @@ void unregister_pernet_device(struct pernet_operations *ops)
 	mutex_unlock(&net_mutex);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
-
-#ifdef CONFIG_NET_NS
-static void *netns_get(struct task_struct *task)
-{
-	struct net *net = NULL;
-	struct nsproxy *nsproxy;
-
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
-	if (nsproxy)
-		net = get_net(nsproxy->net_ns);
-	rcu_read_unlock();
-
-	return net;
-}
-
-static void netns_put(void *ns)
-{
-	put_net(ns);
-}
-
-static int netns_install(struct nsproxy *nsproxy, void *ns)
-{
-	put_net(nsproxy->net_ns);
-	nsproxy->net_ns = get_net(ns);
-	return 0;
-}
-
-const struct proc_ns_operations netns_operations = {
-	.name		= "net",
-	.type		= CLONE_NEWNET,
-	.get		= netns_get,
-	.put		= netns_put,
-	.install	= netns_install,
-};
-#endif
diff --git a/trunk/net/core/rtnetlink.c b/trunk/net/core/rtnetlink.c
index abd936d8a716..2d56cb9b0b94 100644
--- a/trunk/net/core/rtnetlink.c
+++ b/trunk/net/core/rtnetlink.c
@@ -1046,7 +1046,6 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKMODE]		= { .type = NLA_U8 },
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
-	[IFLA_NET_NS_FD]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]		= { .type = NLA_STRING, .len = IFALIASZ-1 },
 	[IFLA_VFINFO_LIST]	= { .type = NLA_NESTED },
 	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
@@ -1095,8 +1094,6 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 	 */
 	if (tb[IFLA_NET_NS_PID])
 		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
-	else if (tb[IFLA_NET_NS_FD])
-		net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
 	else
 		net = get_net(src_net);
 	return net;
@@ -1227,7 +1224,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	int send_addr_notify = 0;
 	int err;
 
-	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
+	if (tb[IFLA_NET_NS_PID]) {
 		struct net *net = rtnl_link_get_net(dev_net(dev), tb);
 		if (IS_ERR(net)) {
 			err = PTR_ERR(net);
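
As context for the cleancache hunks above: the hooks being unwired implement a put/get/flush protocol between the page cache and an opaque backend. A clean page may be handed to the backend on eviction ("put"), a later read may consult the backend before going to disk ("get", which is always allowed to fail), and truncation or overwrite must flush any stale copy. The sketch below is a self-contained user-space toy model of that lifecycle, not kernel code; every toy_* name is invented for illustration and only loosely mirrors the cleancache_ops callbacks deleted above.

/*
 * Toy model of the cleancache pool lifecycle, compilable in user space.
 * Not kernel code; keys here are a bare page index, whereas the real
 * interface keys by (pool id, inode key, page index).
 */
#include <stdio.h>
#include <string.h>

#define POOLS 4
#define SLOTS 8

struct toy_page { long index; char data[16]; int used; };
static struct toy_page pools[POOLS][SLOTS];
static int next_pool;

static int toy_init_fs(void)			/* mount: claim a pool id */
{
	return next_pool < POOLS ? next_pool++ : -1;
}

static void toy_put_page(int pool, long idx, const char *data)
{						/* eviction: cache a clean page */
	struct toy_page *p = &pools[pool][idx % SLOTS];
	p->index = idx;
	snprintf(p->data, sizeof(p->data), "%s", data);
	p->used = 1;
}

static int toy_get_page(int pool, long idx, char *out, size_t len)
{						/* refault: try backend before disk */
	struct toy_page *p = &pools[pool][idx % SLOTS];
	if (!p->used || p->index != idx)
		return -1;			/* a "get" may always miss */
	snprintf(out, len, "%s", p->data);
	return 0;
}

static void toy_flush_page(int pool, long idx)
{						/* truncate/overwrite: drop stale copy */
	struct toy_page *p = &pools[pool][idx % SLOTS];
	if (p->used && p->index == idx)
		p->used = 0;
}

static void toy_flush_fs(int pool)		/* unmount: surrender the pool */
{
	memset(pools[pool], 0, sizeof(pools[pool]));
}

int main(void)
{
	char buf[16];
	int pool = toy_init_fs();

	toy_put_page(pool, 42, "clean data");
	printf("get after put: %s\n",
	       toy_get_page(pool, 42, buf, sizeof(buf)) == 0 ? buf : "miss");
	toy_flush_page(pool, 42);		/* mirrors the flush-on-delete hooks */
	printf("get after flush: %s\n",
	       toy_get_page(pool, 42, buf, sizeof(buf)) == 0 ? buf : "miss");
	toy_flush_fs(pool);
	return 0;
}

The design point the model captures is that the backend is ephemeral: since every get is permitted to miss, the backend may discard pages at any time, and the hooks removed from filemap.c and truncate.c above only had to guarantee coherency (flush on truncate or delete), never persistence.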