From 8c8836f2f464d2ff4e717ba39557fe4c1ef09a21 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Sun, 16 Dec 2012 21:11:54 +0100 Subject: [PATCH] --- yaml --- r: 344911 b: refs/heads/master c: 8547a5bc104496d54c66355df944c348e6525e3f h: refs/heads/master i: 344909: d661cedfa4803d640bbc74693aa810d69063e04b 344907: ca0b6589fd8c90b43306b642df2d617dedacb3c6 344903: 72f512d82a0246e345b3c5be933e909d38f49519 344895: 67feb969e832a180ec6a11d0d5c7e99f7c80046c v: v3 --- [refs] | 2 +- trunk/Documentation/filesystems/ext4.txt | 9 +- trunk/Documentation/kernel-parameters.txt | 3 - trunk/Documentation/prctl/seccomp_filter.txt | 74 +- trunk/Documentation/security/keys.txt | 17 - trunk/MAINTAINERS | 2 +- trunk/arch/sh/mm/Kconfig | 1 - trunk/arch/x86/Kconfig | 2 - trunk/arch/x86/include/asm/pgtable.h | 17 +- trunk/arch/x86/include/asm/pgtable_types.h | 20 - trunk/arch/x86/kernel/vsyscall_64.c | 110 +- trunk/arch/x86/mm/pgtable.c | 8 +- trunk/drivers/bus/Kconfig | 1 - trunk/drivers/char/tpm/tpm_ibmvtpm.c | 81 +- trunk/drivers/char/tpm/tpm_ibmvtpm.h | 5 +- trunk/drivers/input/keyboard/Kconfig | 1 - trunk/drivers/usb/phy/Kconfig | 1 - trunk/drivers/video/omap2/Kconfig | 4 - trunk/drivers/w1/masters/Kconfig | 1 - trunk/drivers/xen/swiotlb-xen.c | 25 +- trunk/fs/Kconfig | 4 +- trunk/fs/cifs/cifsacl.c | 12 +- trunk/fs/ext4/Kconfig | 15 + trunk/fs/ext4/Makefile | 4 +- trunk/fs/ext4/acl.c | 6 +- trunk/fs/ext4/dir.c | 41 +- trunk/fs/ext4/ext4.h | 165 +- trunk/fs/ext4/ext4_extents.h | 40 + trunk/fs/ext4/ext4_jbd2.h | 7 + trunk/fs/ext4/extents.c | 480 +++-- trunk/fs/ext4/extents_status.c | 500 ----- trunk/fs/ext4/extents_status.h | 45 - trunk/fs/ext4/file.c | 336 +--- trunk/fs/ext4/fsync.c | 6 +- trunk/fs/ext4/ialloc.c | 6 +- trunk/fs/ext4/indirect.c | 5 +- trunk/fs/ext4/inline.c | 1884 ------------------ trunk/fs/ext4/inode.c | 629 +++--- trunk/fs/ext4/mballoc.c | 60 +- trunk/fs/ext4/migrate.c | 1 - trunk/fs/ext4/move_extent.c | 1 - trunk/fs/ext4/namei.c | 531 ++--- trunk/fs/ext4/page-io.c | 3 +- trunk/fs/ext4/resize.c | 17 +- trunk/fs/ext4/super.c | 57 +- trunk/fs/ext4/symlink.c | 4 + trunk/fs/ext4/xattr.c | 110 +- trunk/fs/ext4/xattr.h | 158 +- trunk/fs/jbd2/journal.c | 1 + trunk/fs/jbd2/transaction.c | 11 + trunk/fs/nfs/idmap.c | 12 +- trunk/include/asm-generic/pgtable.h | 110 - trunk/include/linux/cred.h | 17 +- trunk/include/linux/huge_mm.h | 16 +- trunk/include/linux/hugetlb.h | 8 +- trunk/include/linux/jbd2.h | 9 +- trunk/include/linux/key.h | 1 - trunk/include/linux/mempolicy.h | 8 - trunk/include/linux/migrate.h | 46 +- trunk/include/linux/mm.h | 39 - trunk/include/linux/mm_types.h | 31 - trunk/include/linux/mmzone.h | 13 - trunk/include/linux/rmap.h | 33 +- trunk/include/linux/sched.h | 27 - trunk/include/linux/swiotlb.h | 20 +- trunk/include/linux/vm_event_item.h | 12 +- trunk/include/linux/vmstat.h | 8 - trunk/include/trace/events/ext4.h | 136 +- trunk/include/trace/events/migrate.h | 51 - trunk/include/uapi/linux/mempolicy.h | 15 +- trunk/init/Kconfig | 44 - trunk/kernel/cred.c | 127 +- trunk/kernel/fork.c | 3 - trunk/kernel/sched/core.c | 71 +- trunk/kernel/sched/fair.c | 227 --- trunk/kernel/sched/features.h | 11 - trunk/kernel/sched/sched.h | 12 - trunk/kernel/seccomp.c | 13 +- trunk/kernel/sysctl.c | 45 +- trunk/lib/swiotlb.c | 269 ++- trunk/mm/compaction.c | 15 +- trunk/mm/huge_memory.c | 108 +- trunk/mm/hugetlb.c | 10 +- trunk/mm/internal.h | 7 +- trunk/mm/ksm.c | 6 +- trunk/mm/memcontrol.c | 7 +- trunk/mm/memory-failure.c | 7 +- trunk/mm/memory.c | 198 +- trunk/mm/memory_hotplug.c | 3 +- 
trunk/mm/mempolicy.c | 283 +-- trunk/mm/migrate.c | 337 +--- trunk/mm/mmap.c | 10 +- trunk/mm/mprotect.c | 135 +- trunk/mm/mremap.c | 2 +- trunk/mm/page_alloc.c | 10 +- trunk/mm/pgtable-generic.c | 9 +- trunk/mm/rmap.c | 66 +- trunk/mm/vmstat.c | 16 +- trunk/net/dns_resolver/dns_key.c | 15 +- trunk/security/keys/key.c | 6 +- trunk/security/keys/keyctl.c | 15 +- trunk/security/keys/keyring.c | 10 +- trunk/security/keys/process_keys.c | 92 +- trunk/security/keys/request_key.c | 21 +- trunk/security/smack/Kconfig | 6 +- trunk/security/smack/smackfs.c | 17 - trunk/security/yama/yama_lsm.c | 88 +- 107 files changed, 1801 insertions(+), 6655 deletions(-) delete mode 100644 trunk/fs/ext4/extents_status.c delete mode 100644 trunk/fs/ext4/extents_status.h delete mode 100644 trunk/fs/ext4/inline.c delete mode 100644 trunk/include/trace/events/migrate.h diff --git a/[refs] b/[refs] index 53eddc986812..e1e3f7f6c4bf 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 9b690c3d56ce15dd265b6398f9d8d58c29c17032 +refs/heads/master: 8547a5bc104496d54c66355df944c348e6525e3f diff --git a/trunk/Documentation/filesystems/ext4.txt b/trunk/Documentation/filesystems/ext4.txt index 34ea4f1fa6ea..104322bf378c 100644 --- a/trunk/Documentation/filesystems/ext4.txt +++ b/trunk/Documentation/filesystems/ext4.txt @@ -200,9 +200,12 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -nouser_xattr Disables Extended User Attributes. See the - attr(5) manual page and http://acl.bestbits.at/ - for more information about extended attributes. +nouser_xattr Disables Extended User Attributes. If you have extended + attribute support enabled in the kernel configuration + (CONFIG_EXT4_FS_XATTR), extended attribute support + is enabled by default on mount. See the attr(5) manual + page and http://acl.bestbits.at/ for more information + about extended attributes. noacl This option disables POSIX Access Control List support. If ACL support is enabled in the kernel diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index ea8e5b485576..20e248cc03a9 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -2032,9 +2032,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. - numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. - Allowed values are enable and disable - numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. diff --git a/trunk/Documentation/prctl/seccomp_filter.txt b/trunk/Documentation/prctl/seccomp_filter.txt index 1e469ef75778..597c3c581375 100644 --- a/trunk/Documentation/prctl/seccomp_filter.txt +++ b/trunk/Documentation/prctl/seccomp_filter.txt @@ -95,15 +95,12 @@ SECCOMP_RET_KILL: SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering - task without executing the system call. siginfo->si_call_addr - will show the address of the system call instruction, and - siginfo->si_syscall and siginfo->si_arch will indicate which - syscall was attempted. The program counter will be as though - the syscall happened (i.e. it will not point to the syscall - instruction). 
The return value register will contain an arch- - dependent value -- if resuming execution, set it to something - sensible. (The architecture dependency is because replacing - it with -ENOSYS could overwrite some useful information.) + task without executing the system call. The kernel will + rollback the register state to just before the system call + entry such that a signal handler in the task will be able to + inspect the ucontext_t->uc_mcontext registers and emulate + system call success or failure upon return from the signal + handler. The SECCOMP_RET_DATA portion of the return value will be passed as si_errno. @@ -126,18 +123,6 @@ SECCOMP_RET_TRACE: the BPF program return value will be available to the tracer via PTRACE_GETEVENTMSG. - The tracer can skip the system call by changing the syscall number - to -1. Alternatively, the tracer can change the system call - requested by changing the system call to a valid syscall number. If - the tracer asks to skip the system call, then the system call will - appear to return the value that the tracer puts in the return value - register. - - The seccomp check will not be run again after the tracer is - notified. (This means that seccomp-based sandboxes MUST NOT - allow use of ptrace, even of other sandboxed processes, without - extreme care; ptracers can use this mechanism to escape.) - SECCOMP_RET_ALLOW: Results in the system call being executed. @@ -176,50 +161,3 @@ architecture supports both ptrace_event and seccomp, it will be able to support seccomp filter with minor fixup: SIGSYS support and seccomp return value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER to its arch-specific Kconfig. - - - -Caveats -------- - -The vDSO can cause some system calls to run entirely in userspace, -leading to surprises when you run programs on different machines that -fall back to real syscalls. To minimize these surprises on x86, make -sure you test with -/sys/devices/system/clocksource/clocksource0/current_clocksource set to -something like acpi_pm. - -On x86-64, vsyscall emulation is enabled by default. (vsyscalls are -legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities: - -- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to - the vsyscall entry for the given call and not the address after the - 'syscall' instruction. Any code which wants to restart the call - should be aware that (a) a ret instruction has been emulated and (b) - trying to resume the syscall will again trigger the standard vsyscall - emulation security checks, making resuming the syscall mostly - pointless. - -- A return value of SECCOMP_RET_TRACE will signal the tracer as usual, - but the syscall may not be changed to another system call using the - orig_rax register. It may only be changed to -1 order to skip the - currently emulated call. Any other change MAY terminate the process. - The rip value seen by the tracer will be the syscall entry address; - this is different from normal behavior. The tracer MUST NOT modify - rip or rsp. (Do not rely on other changes terminating the process. - They might work. For example, on some kernels, choosing a syscall - that only exists in future kernels will be correctly emulated (by - returning -ENOSYS). - -To detect this quirky behavior, check for addr & ~0x0C00 == -0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For -SECCOMP_RET_TRAP, use siginfo->si_call_addr.) 
Do not check any other -condition: future kernels may improve vsyscall emulation and current -kernels in vsyscall=native mode will behave differently, but the -instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these -cases. - -Note that modern systems are unlikely to use vsyscalls at all -- they -are a legacy feature and they are considerably slower than standard -syscalls. New code will use the vDSO, and vDSO-issued system calls -are indistinguishable from normal system calls. diff --git a/trunk/Documentation/security/keys.txt b/trunk/Documentation/security/keys.txt index 7b4145d00452..7d9ca92022d8 100644 --- a/trunk/Documentation/security/keys.txt +++ b/trunk/Documentation/security/keys.txt @@ -994,23 +994,6 @@ payload contents" for more information. reference pointer if successful. -(*) A keyring can be created by: - - struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - const struct cred *cred, - key_perm_t perm, - unsigned long flags, - struct key *dest); - - This creates a keyring with the given attributes and returns it. If dest - is not NULL, the new keyring will be linked into the keyring to which it - points. No permission checks are made upon the destination keyring. - - Error EDQUOT can be returned if the keyring would overload the quota (pass - KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted - towards the user's quota). Error ENOMEM can also be returned. - - (*) To check the validity of a key, this function can be called: int validate_key(struct key *key); diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index f71d2f901a69..d9c31b906ac9 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -3712,7 +3712,7 @@ I2C/SMBUS STUB DRIVER M: "Mark M. Hoffman" L: linux-i2c@vger.kernel.org S: Maintained -F: drivers/i2c/busses/i2c-stub.c +F: drivers/i2c/i2c-stub.c I2C SUBSYSTEM M: Wolfram Sang diff --git a/trunk/arch/sh/mm/Kconfig b/trunk/arch/sh/mm/Kconfig index 0f7c852f355c..cb8f9920f4dd 100644 --- a/trunk/arch/sh/mm/Kconfig +++ b/trunk/arch/sh/mm/Kconfig @@ -111,7 +111,6 @@ config VSYSCALL config NUMA bool "Non Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL - select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n help Some SH systems have many various memories scattered around diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 97f8c5ad8c2d..65a872bf72f9 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -22,8 +22,6 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index 5199db2923d3..a1f780d45f76 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -404,14 +404,7 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_accessible pte_accessible -static inline int pte_accessible(pte_t a) -{ - return pte_flags(a) & _PAGE_PRESENT; + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } static inline int pte_hidden(pte_t pte) @@ -427,8 +420,7 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). 
*/ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } static inline int pmd_none(pmd_t pmd) @@ -487,11 +479,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } diff --git a/trunk/arch/x86/include/asm/pgtable_types.h b/trunk/arch/x86/include/asm/pgtable_types.h index 3c32db8c539d..ec8a1fc9505d 100644 --- a/trunk/arch/x86/include/asm/pgtable_types.h +++ b/trunk/arch/x86/include/asm/pgtable_types.h @@ -64,26 +64,6 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -/* - * _PAGE_NUMA indicates that this page will trigger a numa hinting - * minor page fault to gather numa placement statistics (see - * pte_numa()). The bit picked (8) is within the range between - * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't - * require changes to the swp entry format because that bit is always - * zero when the pte is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - * - * Because we shared the same bit (8) with _PAGE_PROTNONE this can be - * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE - * couldn't reach, like handle_mm_fault() (see access_error in - * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for - * handle_mm_fault() to be invoked). - */ -#define _PAGE_NUMA _PAGE_PROTNONE - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ diff --git a/trunk/arch/x86/kernel/vsyscall_64.c b/trunk/arch/x86/kernel/vsyscall_64.c index 9a907a67be8f..3a3e8c9e280d 100644 --- a/trunk/arch/x86/kernel/vsyscall_64.c +++ b/trunk/arch/x86/kernel/vsyscall_64.c @@ -145,6 +145,19 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ + if (!seccomp_mode(&tsk->seccomp)) + return 0; + task_pt_regs(tsk)->orig_ax = syscall_nr; + task_pt_regs(tsk)->ax = syscall_nr; + return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif + static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -177,9 +190,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; + int vsyscall_nr; int prev_sig_on_uaccess_error; long ret; + int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -211,84 +225,56 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; /* - * Check for access_ok violations and find the syscall nr. - * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. 
For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ + ret = -EFAULT; + skip = 0; switch (vsyscall_nr) { case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(syscall_nr); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - if (tmp) - goto do_ret; /* skip requested */ + skip = vsyscall_seccomp(tsk, __NR_gettimeofday); + if (skip) + break; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) + break; - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: + skip = vsyscall_seccomp(tsk, __NR_time); + if (skip) + break; + + if (!write_ok_or_segv(regs->di, sizeof(time_t))) + break; + ret = sys_time((time_t __user *)regs->di); break; case 2: + skip = vsyscall_seccomp(tsk, __NR_getcpu); + if (skip) + break; + + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) + break; + ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -297,7 +283,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; -check_fault: + if (skip) { + if ((long)regs->ax <= 0L) /* seccomp errno emulation */ + goto do_ret; + goto done; /* seccomp trace/trap */ + } + if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -320,6 +311,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; +done: return true; sigsegv: diff --git a/trunk/arch/x86/mm/pgtable.c b/trunk/arch/x86/mm/pgtable.c index e27fbf887f3b..217eb705fac0 100644 --- a/trunk/arch/x86/mm/pgtable.c +++ b/trunk/arch/x86/mm/pgtable.c @@ -301,13 +301,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -/* - * Used to set accessed or dirty bits in the page table entries - * on other architectures. On x86, the accessed and dirty bits - * are tracked by hardware. However, do_wp_page calls this function - * to also make the pte writeable at the same time the dirty bit is - * set. In that case we do actually need to write the PTE. 
- */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) @@ -317,6 +310,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); } return changed; diff --git a/trunk/drivers/bus/Kconfig b/trunk/drivers/bus/Kconfig index 0f51ed687dc8..bbec35d21fe5 100644 --- a/trunk/drivers/bus/Kconfig +++ b/trunk/drivers/bus/Kconfig @@ -6,7 +6,6 @@ menu "Bus devices" config OMAP_OCP2SCP tristate "OMAP OCP2SCP DRIVER" - depends on ARCH_OMAP2PLUS help Driver to enable ocp2scp module which transforms ocp interface protocol to scp protocol. In OMAP4, USB PHY is connected via diff --git a/trunk/drivers/char/tpm/tpm_ibmvtpm.c b/trunk/drivers/char/tpm/tpm_ibmvtpm.c index 9978609d93b2..7da840d487d2 100644 --- a/trunk/drivers/char/tpm/tpm_ibmvtpm.c +++ b/trunk/drivers/char/tpm/tpm_ibmvtpm.c @@ -38,6 +38,8 @@ static struct vio_device_id tpm_ibmvtpm_device_table[] = { }; MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table); +DECLARE_WAIT_QUEUE_HEAD(wq); + /** * ibmvtpm_send_crq - Send a CRQ request * @vdev: vio device struct @@ -81,7 +83,6 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) { struct ibmvtpm_dev *ibmvtpm; u16 len; - int sig; ibmvtpm = (struct ibmvtpm_dev *)chip->vendor.data; @@ -90,23 +91,22 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) return 0; } - sig = wait_event_interruptible(ibmvtpm->wq, ibmvtpm->res_len != 0); - if (sig) - return -EINTR; - - len = ibmvtpm->res_len; + wait_event_interruptible(wq, ibmvtpm->crq_res.len != 0); - if (count < len) { + if (count < ibmvtpm->crq_res.len) { dev_err(ibmvtpm->dev, "Invalid size in recv: count=%ld, crq_size=%d\n", - count, len); + count, ibmvtpm->crq_res.len); return -EIO; } spin_lock(&ibmvtpm->rtce_lock); - memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, len); - memset(ibmvtpm->rtce_buf, 0, len); - ibmvtpm->res_len = 0; + memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, ibmvtpm->crq_res.len); + memset(ibmvtpm->rtce_buf, 0, ibmvtpm->crq_res.len); + ibmvtpm->crq_res.valid = 0; + ibmvtpm->crq_res.msg = 0; + len = ibmvtpm->crq_res.len; + ibmvtpm->crq_res.len = 0; spin_unlock(&ibmvtpm->rtce_lock); return len; } @@ -273,6 +273,7 @@ static int tpm_ibmvtpm_remove(struct vio_dev *vdev) int rc = 0; free_irq(vdev->irq, ibmvtpm); + tasklet_kill(&ibmvtpm->tasklet); do { if (rc) @@ -371,6 +372,7 @@ static int ibmvtpm_reset_crq(struct ibmvtpm_dev *ibmvtpm) static int tpm_ibmvtpm_resume(struct device *dev) { struct ibmvtpm_dev *ibmvtpm = ibmvtpm_get_data(dev); + unsigned long flags; int rc = 0; do { @@ -385,11 +387,10 @@ static int tpm_ibmvtpm_resume(struct device *dev) return rc; } - rc = vio_enable_interrupts(ibmvtpm->vdev); - if (rc) { - dev_err(dev, "Error vio_enable_interrupts rc=%d\n", rc); - return rc; - } + spin_lock_irqsave(&ibmvtpm->lock, flags); + vio_disable_interrupts(ibmvtpm->vdev); + tasklet_schedule(&ibmvtpm->tasklet); + spin_unlock_irqrestore(&ibmvtpm->lock, flags); rc = ibmvtpm_crq_send_init(ibmvtpm); if (rc) @@ -466,7 +467,7 @@ static struct ibmvtpm_crq *ibmvtpm_crq_get_next(struct ibmvtpm_dev *ibmvtpm) if (crq->valid & VTPM_MSG_RES) { if (++crq_q->index == crq_q->num_entry) crq_q->index = 0; - smp_rmb(); + rmb(); } else crq = NULL; return crq; @@ -534,9 +535,11 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq, ibmvtpm->vtpm_version = crq->data; return; case VTPM_TPM_COMMAND_RES: - /* len of the data in rtce buffer 
*/ - ibmvtpm->res_len = crq->len; - wake_up_interruptible(&ibmvtpm->wq); + ibmvtpm->crq_res.valid = crq->valid; + ibmvtpm->crq_res.msg = crq->msg; + ibmvtpm->crq_res.len = crq->len; + ibmvtpm->crq_res.data = crq->data; + wake_up_interruptible(&wq); return; default: return; @@ -556,19 +559,38 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq, static irqreturn_t ibmvtpm_interrupt(int irq, void *vtpm_instance) { struct ibmvtpm_dev *ibmvtpm = (struct ibmvtpm_dev *) vtpm_instance; + unsigned long flags; + + spin_lock_irqsave(&ibmvtpm->lock, flags); + vio_disable_interrupts(ibmvtpm->vdev); + tasklet_schedule(&ibmvtpm->tasklet); + spin_unlock_irqrestore(&ibmvtpm->lock, flags); + + return IRQ_HANDLED; +} + +/** + * ibmvtpm_tasklet - Interrupt handler tasklet + * @data: ibm vtpm device struct + * + * Returns: + * Nothing + **/ +static void ibmvtpm_tasklet(void *data) +{ + struct ibmvtpm_dev *ibmvtpm = data; struct ibmvtpm_crq *crq; + unsigned long flags; - /* while loop is needed for initial setup (get version and - * get rtce_size). There should be only one tpm request at any - * given time. - */ + spin_lock_irqsave(&ibmvtpm->lock, flags); while ((crq = ibmvtpm_crq_get_next(ibmvtpm)) != NULL) { ibmvtpm_crq_process(crq, ibmvtpm); crq->valid = 0; - smp_wmb(); + wmb(); } - return IRQ_HANDLED; + vio_enable_interrupts(ibmvtpm->vdev); + spin_unlock_irqrestore(&ibmvtpm->lock, flags); } /** @@ -628,6 +650,9 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, goto reg_crq_cleanup; } + tasklet_init(&ibmvtpm->tasklet, (void *)ibmvtpm_tasklet, + (unsigned long)ibmvtpm); + rc = request_irq(vio_dev->irq, ibmvtpm_interrupt, 0, tpm_ibmvtpm_driver_name, ibmvtpm); if (rc) { @@ -641,14 +666,13 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, goto init_irq_cleanup; } - init_waitqueue_head(&ibmvtpm->wq); - crq_q->index = 0; ibmvtpm->dev = dev; ibmvtpm->vdev = vio_dev; chip->vendor.data = (void *)ibmvtpm; + spin_lock_init(&ibmvtpm->lock); spin_lock_init(&ibmvtpm->rtce_lock); rc = ibmvtpm_crq_send_init(ibmvtpm); @@ -665,6 +689,7 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, return rc; init_irq_cleanup: + tasklet_kill(&ibmvtpm->tasklet); do { rc1 = plpar_hcall_norets(H_FREE_CRQ, vio_dev->unit_address); } while (rc1 == H_BUSY || H_IS_LONG_BUSY(rc1)); diff --git a/trunk/drivers/char/tpm/tpm_ibmvtpm.h b/trunk/drivers/char/tpm/tpm_ibmvtpm.h index bd82a791f995..4296eb4b4d82 100644 --- a/trunk/drivers/char/tpm/tpm_ibmvtpm.h +++ b/trunk/drivers/char/tpm/tpm_ibmvtpm.h @@ -38,12 +38,13 @@ struct ibmvtpm_dev { struct vio_dev *vdev; struct ibmvtpm_crq_queue crq_queue; dma_addr_t crq_dma_handle; + spinlock_t lock; + struct tasklet_struct tasklet; u32 rtce_size; void __iomem *rtce_buf; dma_addr_t rtce_dma_handle; spinlock_t rtce_lock; - wait_queue_head_t wq; - u16 res_len; + struct ibmvtpm_crq crq_res; u32 vtpm_version; }; diff --git a/trunk/drivers/input/keyboard/Kconfig b/trunk/drivers/input/keyboard/Kconfig index febead4bf8a5..77629d33f03f 100644 --- a/trunk/drivers/input/keyboard/Kconfig +++ b/trunk/drivers/input/keyboard/Kconfig @@ -544,7 +544,6 @@ config KEYBOARD_OMAP config KEYBOARD_OMAP4 tristate "TI OMAP4+ keypad support" - depends on ARCH_OMAP2PLUS select INPUT_MATRIXKMAP help Say Y here if you want to use the OMAP4+ keypad. 
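The tpm_ibmvtpm hunks above switch the driver to a hard interrupt handler plus tasklet bottom half: the handler masks the device interrupt and schedules a tasklet, the tasklet drains the response queue under ibmvtpm->lock before re-enabling interrupts, and tasklet_kill() is the matching teardown in the remove and error paths. Below is a minimal sketch of that generic pattern, not code from the patch; my_dev, drain_one_event(), dev_mask_irq() and dev_unmask_irq() are hypothetical stand-ins for the driver's CRQ processing loop and vio_disable_interrupts()/vio_enable_interrupts().

    #include <linux/interrupt.h>
    #include <linux/spinlock.h>

    struct my_dev {
            spinlock_t lock;
            struct tasklet_struct tasklet;
            int irq;
    };

    /* Top half: keep it short -- mask the source and defer the real work. */
    static irqreturn_t my_dev_interrupt(int irq, void *data)
    {
            struct my_dev *dev = data;
            unsigned long flags;

            spin_lock_irqsave(&dev->lock, flags);
            dev_mask_irq(dev);               /* hypothetical, like vio_disable_interrupts() */
            tasklet_schedule(&dev->tasklet);
            spin_unlock_irqrestore(&dev->lock, flags);
            return IRQ_HANDLED;
    }

    /* Bottom half: drain queued responses, then unmask the interrupt. */
    static void my_dev_tasklet(unsigned long data)
    {
            struct my_dev *dev = (struct my_dev *)data;
            unsigned long flags;

            spin_lock_irqsave(&dev->lock, flags);
            while (drain_one_event(dev))     /* hypothetical, like the CRQ loop above */
                    ;
            dev_unmask_irq(dev);             /* hypothetical, like vio_enable_interrupts() */
            spin_unlock_irqrestore(&dev->lock, flags);
    }

    /* Probe-time wiring, mirroring the tasklet_init()/request_irq() calls above. */
    static int my_dev_setup(struct my_dev *dev)
    {
            spin_lock_init(&dev->lock);
            tasklet_init(&dev->tasklet, my_dev_tasklet, (unsigned long)dev);
            return request_irq(dev->irq, my_dev_interrupt, 0, "my_dev", dev);
    }

The design choice visible in the hunks is the usual one for slow virtual-IO queues: the hard handler only quiesces the interrupt source and defers, so queue processing and the re-enable happen in softirq context without holding off other interrupts.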
diff --git a/trunk/drivers/usb/phy/Kconfig b/trunk/drivers/usb/phy/Kconfig index 5de6e7f39f9c..7eb73c561bd2 100644 --- a/trunk/drivers/usb/phy/Kconfig +++ b/trunk/drivers/usb/phy/Kconfig @@ -6,7 +6,6 @@ comment "USB Physical Layer drivers" config OMAP_USB2 tristate "OMAP USB2 PHY Driver" - depends on ARCH_OMAP2PLUS select USB_OTG_UTILS help Enable this to support the transceiver that is part of SOC. This diff --git a/trunk/drivers/video/omap2/Kconfig b/trunk/drivers/video/omap2/Kconfig index b07b2b042e7e..346d67d6cf4d 100644 --- a/trunk/drivers/video/omap2/Kconfig +++ b/trunk/drivers/video/omap2/Kconfig @@ -1,10 +1,6 @@ config OMAP2_VRFB bool -if ARCH_OMAP2PLUS - source "drivers/video/omap2/dss/Kconfig" source "drivers/video/omap2/omapfb/Kconfig" source "drivers/video/omap2/displays/Kconfig" - -endif diff --git a/trunk/drivers/w1/masters/Kconfig b/trunk/drivers/w1/masters/Kconfig index e8ca63a82b97..c433a746e3f5 100644 --- a/trunk/drivers/w1/masters/Kconfig +++ b/trunk/drivers/w1/masters/Kconfig @@ -60,7 +60,6 @@ config W1_MASTER_GPIO config HDQ_MASTER_OMAP tristate "OMAP HDQ driver" - depends on ARCH_OMAP help Say Y here if you want support for the 1-wire or HDQ Interface on an OMAP processor. diff --git a/trunk/drivers/xen/swiotlb-xen.c b/trunk/drivers/xen/swiotlb-xen.c index af47e7594460..58db6df866ef 100644 --- a/trunk/drivers/xen/swiotlb-xen.c +++ b/trunk/drivers/xen/swiotlb-xen.c @@ -338,8 +338,9 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); + void *map; BUG_ON(dir == DMA_NONE); /* @@ -355,10 +356,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * Oh well, have to allocate and map a bounce buffer. */ map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (map == SWIOTLB_MAP_ERROR) + if (!map) return DMA_ERROR_CODE; - dev_addr = xen_phys_to_bus(map); + dev_addr = xen_virt_to_bus(map); /* * Ensure that the address returned is DMA'ble @@ -388,7 +389,7 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } @@ -433,7 +434,8 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); return; } @@ -492,12 +494,11 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - phys_addr_t map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, - dir); - if (map == SWIOTLB_MAP_ERROR) { + void *map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, dir); + if (!map) { /* Don't panic here, we expect map_sg users to do proper error handling. 
*/ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, @@ -505,7 +506,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sgl[0].dma_length = 0; return DMA_ERROR_CODE; } - sg->dma_address = xen_phys_to_bus(map); + sg->dma_address = xen_virt_to_bus(map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; diff --git a/trunk/fs/Kconfig b/trunk/fs/Kconfig index eaff24a19502..f95ae3a027f3 100644 --- a/trunk/fs/Kconfig +++ b/trunk/fs/Kconfig @@ -28,8 +28,8 @@ config FS_MBCACHE tristate default y if EXT2_FS=y && EXT2_FS_XATTR default y if EXT3_FS=y && EXT3_FS_XATTR - default y if EXT4_FS=y - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS + default y if EXT4_FS=y && EXT4_FS_XATTR + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" diff --git a/trunk/fs/cifs/cifsacl.c b/trunk/fs/cifs/cifsacl.c index 5cbd00e74067..75c1ee699143 100644 --- a/trunk/fs/cifs/cifsacl.c +++ b/trunk/fs/cifs/cifsacl.c @@ -346,15 +346,19 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + ret = register_key_type(&cifs_idmap_key_type); if (ret < 0) goto failed_put_key; diff --git a/trunk/fs/ext4/Kconfig b/trunk/fs/ext4/Kconfig index 0a475c881852..c22f17021b6e 100644 --- a/trunk/fs/ext4/Kconfig +++ b/trunk/fs/ext4/Kconfig @@ -39,8 +39,22 @@ config EXT4_USE_FOR_EXT23 compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -53,6 +67,7 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. 
This option diff --git a/trunk/fs/ext4/Makefile b/trunk/fs/ext4/Makefile index 0310fec2ee3d..56fd8f865930 100644 --- a/trunk/fs/ext4/Makefile +++ b/trunk/fs/ext4/Makefile @@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o + mmp.o indirect.o +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/trunk/fs/ext4/acl.c b/trunk/fs/ext4/acl.c index e6e0d988439b..d3c5b88fd89f 100644 --- a/trunk/fs/ext4/acl.c +++ b/trunk/fs/ext4/acl.c @@ -423,10 +423,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, retry: handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto release_and_out; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); error = ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) diff --git a/trunk/fs/ext4/dir.c b/trunk/fs/ext4/dir.c index b8d877f6c1fa..8e07d2a5a139 100644 --- a/trunk/fs/ext4/dir.c +++ b/trunk/fs/ext4/dir.c @@ -27,11 +27,23 @@ #include #include #include "ext4.h" -#include "xattr.h" + +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return (ext4_filetype_table[filetype]); +} + /** * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which chould potentially get coverted to use htree @@ -56,14 +68,11 @@ static int is_dx_dir(struct inode *inode) * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... - * - * bh passed here can be an inode block or a dir data block, depending - * on the inode inline data flag. 
*/ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, - struct buffer_head *bh, char *buf, int size, + struct buffer_head *bh, unsigned int offset) { const char *error_msg = NULL; @@ -76,8 +85,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - buf) + rlen > size)) - error_msg = "directory entry across range"; + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -88,14 +98,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -115,14 +125,6 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, - &has_inline_data); - if (has_inline_data) - return ret; - } - if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -219,9 +221,8 @@ static int ext4_readdir(struct file *filp, while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, bh, - bh->b_data, bh->b_size, - offset)) { + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/trunk/fs/ext4/ext4.h b/trunk/fs/ext4/ext4.h index 8462eb3c33aa..df163da388c9 100644 --- a/trunk/fs/ext4/ext4.h +++ b/trunk/fs/ext4/ext4.h @@ -57,16 +57,6 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) @@ -402,7 +392,6 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ -#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -459,26 +448,28 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ - EXT4_INODE_INLINE_DATA = 28, /* Data in inode. 
*/ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -/* - * Since it's pretty easy to mix up bit numbers and hex values, we use a - * build-time check to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost - * any extra space in the compiled kernel image, otherwise, the build will fail. - * It's important that these values are the same, since we are using - * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent - * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk - * values found in ext2, ext3 and ext4 filesystems, and of course the values - * defined in e2fsprogs. +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) - static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); @@ -503,7 +494,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); - CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -821,8 +811,6 @@ struct ext4_ext_cache { __u32 ec_len; /* must be 32bit to return holes */ }; -#include "extents_status.h" - /* * fourth extended file system inode data in memory */ @@ -845,6 +833,7 @@ struct ext4_inode_info { #endif unsigned long i_flags; +#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -853,6 +842,7 @@ struct ext4_inode_info { * EAs. */ struct rw_semaphore xattr_sem; +#endif struct list_head i_orphan; /* unlinked but open inodes */ @@ -898,10 +888,6 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; - /* extents status tree */ - struct ext4_es_tree i_es_tree; - rwlock_t i_es_lock; - /* ialloc */ ext4_group_t i_last_alloc_group; @@ -916,10 +902,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - /* Indicate the inline data space. 
*/ - u16 i_inline_off; - u16 i_inline_size; - #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -1378,7 +1360,6 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ - EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1500,7 +1481,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1524,8 +1505,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP | \ - EXT4_FEATURE_INCOMPAT_INLINE_DATA) + EXT4_FEATURE_INCOMPAT_MMP) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1612,11 +1592,6 @@ struct ext4_dir_entry_tail { __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; -#define EXT4_DIRENT_TAIL(block, blocksize) \ - ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ - ((blocksize) - \ - sizeof(struct ext4_dir_entry_tail)))) - /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. 
@@ -1961,42 +1936,14 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, char *, int, - unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (buf), (size), (offset))) + (de), (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, - void *buf, int buf_size, - const char *name, int namelen, - struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen); -static inline void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static inline unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; - - return ext4_filetype_table[filetype]; -} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); @@ -2047,23 +1994,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create); -int ext4_walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)); -int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh); -#define FALL_BACK_TO_NONDELALLOC 1 -#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); @@ -2118,20 +2050,6 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); -extern int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - void *entry_buf, - int buf_size, - int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -2458,15 +2376,6 @@ extern void ext4_unwritten_wait(struct inode *inode); extern const struct inode_operations ext4_dir_inode_operations; extern const struct 
inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); -extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len); -extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize); -extern int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2484,9 +2393,6 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ -struct ext4_ext_path; -struct ext4_extent; - extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, @@ -2504,27 +2410,8 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path *, - struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); - - /* move_extent.c */ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, @@ -2558,10 +2445,14 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test whether block and inode bitmaps are properly @@ -2612,4 +2503,6 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#include "ext4_extents.h" + #endif /* _EXT4_H */ diff --git a/trunk/fs/ext4/ext4_extents.h b/trunk/fs/ext4/ext4_extents.h index 487fda12bc00..cb1b2c919963 100644 --- a/trunk/fs/ext4/ext4_extents.h +++ b/trunk/fs/ext4/ext4_extents.h @@ -42,6 +42,16 @@ */ #define CHECK_BINSEARCH__ +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. 
@@ -133,6 +143,20 @@ struct ext4_ext_path { * structure for external API */ +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. @@ -276,5 +300,21 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/trunk/fs/ext4/ext4_jbd2.h b/trunk/fs/ext4/ext4_jbd2.h index 7177f9b21cb2..56d258c18303 100644 --- a/trunk/fs/ext4/ext4_jbd2.h +++ b/trunk/fs/ext4/ext4_jbd2.h @@ -254,6 +254,13 @@ static inline void ext4_handle_sync(handle_t *handle) handle->h_sync = 1; } +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) diff --git a/trunk/fs/ext4/extents.c b/trunk/fs/ext4/extents.c index 26af22832a84..7011ac967208 100644 --- a/trunk/fs/ext4/extents.c +++ b/trunk/fs/ext4/extents.c @@ -41,8 +41,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" -#include "xattr.h" #include @@ -111,9 +109,6 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); -static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex); - static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -1964,33 +1959,27 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, return err; } -static int ext4_fill_fiemap_extents(struct inode *inode, - ext4_lblk_t block, ext4_lblk_t num, - struct fiemap_extent_info *fieinfo) +static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache newex; + struct ext4_ext_cache cbex; struct ext4_extent *ex; - ext4_lblk_t next, next_del, start = 0, end = 0; + ext4_lblk_t next, start = 0, end = 0; ext4_lblk_t last = block + num; - int exists, depth = 0, err = 0; - unsigned int flags = 0; - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ 
down_read(&EXT4_I(inode)->i_data_sem); - - if (path && ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; - } - path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { - up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; @@ -1998,16 +1987,13 @@ static int ext4_fill_fiemap_extents(struct inode *inode, depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { - up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EIO; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); - flags = 0; exists = 0; if (!ex) { /* there is no extent yet, so try to allocate @@ -2044,64 +2030,40 @@ static int ext4_fill_fiemap_extents(struct inode *inode, BUG_ON(end <= start); if (!exists) { - newex.ec_block = start; - newex.ec_len = end - start; - newex.ec_start = 0; + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; } else { - newex.ec_block = le32_to_cpu(ex->ee_block); - newex.ec_len = ext4_ext_get_actual_len(ex); - newex.ec_start = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - } - - /* - * Find delayed extent and update newex accordingly. We call - * it even in !exists case to find out whether newex is the - * last existing extent or not. - */ - next_del = ext4_find_delayed_extent(inode, &newex); - if (!exists && next_del) { - exists = 1; - flags |= FIEMAP_EXTENT_DELALLOC; + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext4_ext_pblock(ex); } - up_read(&EXT4_I(inode)->i_data_sem); - if (unlikely(newex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); err = -EIO; break; } + err = func(inode, next, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); - /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ - if (next == next_del) { - flags |= FIEMAP_EXTENT_LAST; - if (unlikely(next_del != EXT_MAX_BLOCKS || - next != EXT_MAX_BLOCKS)) { - EXT4_ERROR_INODE(inode, - "next extent == %u, next " - "delalloc extent = %u", - next, next_del); - err = -EIO; - break; - } + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; } - if (exists) { - err = fiemap_fill_next_extent(fieinfo, - (__u64)newex.ec_block << blksize_bits, - (__u64)newex.ec_start << blksize_bits, - (__u64)newex.ec_len << blksize_bits, - flags); - if (err < 0) - break; - if (err == 1) { - err = 0; - break; - } + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; } - block = newex.ec_block + newex.ec_len; + block = cbex.ec_block + cbex.ec_len; } if (path) { @@ -2194,6 +2156,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; int ret = 0; /* @@ -2201,6 +2164,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); /* has cache valid data? 
*/ if (cex->ec_len == 0) @@ -2309,13 +2273,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; - int depth; - - /* If we are converting the inline data, only one is needed here. */ - if (ext4_has_inline_data(inode)) - return 1; - - depth = ext_depth(inode); + int depth = ext_depth(inode); if (chunk) index = depth * 2; @@ -3503,34 +3461,115 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, /** * ext4_find_delalloc_range: find delayed allocated block in the given range. * - * Return 1 if there is a delalloc block in the range, otherwise 0. + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. */ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) + ext4_lblk_t lblk_end, + int search_hint_reverse) { - struct extent_status es; + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; - es.start = lblk_start; - ext4_es_find_extent(inode, &es); - if (es.len == 0) - return 0; /* there is no delay extent in this tree */ - else if (es.start <= lblk_start && lblk_start < es.start + es.len) - return 1; - else if (lblk_start <= es.start && es.start <= lblk_end) - return 1; - else + if (!test_opt(inode->i_sb, DELALLOC)) return 0; + + /* reverse search wont work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page) + goto nextpage; + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { + page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. 
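
The restored ext4_find_delalloc_range() above keeps hopping between logical block numbers and page-cache indices with shifts by (PAGE_CACHE_SHIFT - inode->i_blkbits). A standalone sketch of that arithmetic, assuming 4 KiB pages and a 1 KiB filesystem block size purely for illustration:

/*
 * Illustration of the lblk <-> page-index arithmetic used by
 * ext4_find_delalloc_range().  4 KiB pages and 1 KiB blocks are assumed
 * only for the sake of the example.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;	/* 4 KiB pages */
	const unsigned int blkbits = 10;	/* 1 KiB blocks */
	const unsigned int shift = page_shift - blkbits;
	unsigned int lblk = 13;			/* some logical block */

	/* index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits) */
	unsigned long index = lblk >> shift;

	/* pg_lblk = index << (PAGE_CACHE_SHIFT - inode->i_blkbits) */
	unsigned int pg_lblk = index << shift;

	printf("lblk %u lives in page %lu, whose first lblk is %u\n",
	       lblk, index, pg_lblk);
	/* -> lblk 13 lives in page 3, whose first lblk is 12 */
	return 0;
}
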
+ */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); + return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); } /** @@ -3591,7 +3630,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) allocated_clusters--; } @@ -3601,7 +3640,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) allocated_clusters--; } @@ -3624,8 +3663,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, - allocated, newblock); + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { @@ -3872,7 +3911,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -3888,7 +3927,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3968,15 +4007,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len, ee_start); goto out; } - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - goto out3; + return ret; } } if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; /* @@ -4245,8 +4284,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, kfree(path); } -out3: - trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); return err ? 
err : allocated; } @@ -4305,8 +4344,6 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_es_remove_extent(inode, last_block, - EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final @@ -4397,10 +4434,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(file, offset, len); - ret = ext4_convert_inline_data(inode); - if (ret) - return ret; - trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -4539,43 +4572,206 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * If newex is not existing extent (newex->ec_start equals zero) find - * delayed extent at start of newex and update newex accordingly and - * return start of the next delayed extent. - * - * If newex is existing extent (newex->ec_start is not equal zero) - * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed - * extent found. Leave newex unmodified. + * Callback function called for each extent to gather FIEMAP information. */ -static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex) +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) { - struct extent_status es; - ext4_lblk_t next_del; + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; - es.start = newex->ec_block; - next_del = ext4_es_find_extent(inode, &es); + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, * then the block may stay in 1)a hole or 2)delayed-extent. + * + * Holes or delayed-extents are processed as follows. + * 1. lookup dirty pages with specified range in pagecache. + * If no page is got, then there is no delayed-extent and + * return with EXT_CONTINUE. + * 2. find the 1st mapped buffer, + * 3. check if the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed-extent, + * then return. + * 4. a delayed-extent is found, the extent will be collected. */ - if (es.len == 0) - /* A hole found. */ - return 0; + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + pgoff_t start_index = 0; + struct page **pages = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; - if (es.start > newex->ec_block) { - /* A hole found. */ - newex->ec_len = min(es.start - newex->ec_block, - newex->ec_len); - return 0; + offset = logical >> PAGE_SHIFT; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } + index = 0; + +next_page: + /* Try to find the 1st mapped buffer. 
*/ + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[index])) + goto out; + head = page_buffers(pages[index]); + if (!head) + goto out; + + index++; + bh = head; + do { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; + /* get the 1st mapped buffer. */ + goto found_mapped_buffer; + } + + bh = bh->b_this_page; + end++; + } while (bh != head); + + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. + */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; + } else { + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; + index = 1; + start_index = 0; + } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + + if (pages[index]->index != + pages[start_index]->index + index + - start_index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. 
*/ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; } - newex->ec_len = es.start + es.len - newex->ec_block; + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); } - return next_del; + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + if (next == EXT_MAX_BLOCKS) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (ret < 0) + return ret; + if (ret == 1) + return EXT_BREAK; + return EXT_CONTINUE; } /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4775,8 +4971,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); ext4_ext_invalidate_cache(inode); @@ -4797,22 +4991,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) mutex_unlock(&inode->i_mutex); return err; } - int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk; int error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline = 1; - - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); - - if (has_inline) - return error; - } - /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -4834,11 +5018,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* - * Walk the extent tree gathering extent information - * and pushing extents back to the user. + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. */ - error = ext4_fill_fiemap_extents(inode, start_blk, - len_blks, fieinfo); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); } return error; diff --git a/trunk/fs/ext4/extents_status.c b/trunk/fs/ext4/extents_status.c deleted file mode 100644 index 564d981a2fcc..000000000000 --- a/trunk/fs/ext4/extents_status.c +++ /dev/null @@ -1,500 +0,0 @@ -/* - * fs/ext4/extents_status.c - * - * Written by Yongqiang Yang - * Modified by - * Allison Henderson - * Hugh Dickins - * Zheng Liu - * - * Ext4 extents status tree core functions. - */ -#include -#include "ext4.h" -#include "extents_status.h" -#include "ext4_extents.h" - -#include - -/* - * According to previous discussion in Ext4 Developer Workshop, we - * will introduce a new structure called io tree to track all extent - * status in order to solve some problems that we have met - * (e.g. Reservation space warning), and provide extent-level locking. - * Delay extent tree is the first step to achieve this goal. It is - * original built by Yongqiang Yang. At that time it is called delay - * extent tree, whose goal is only track delay extent in memory to - * simplify the implementation of fiemap and bigalloc, and introduce - * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called - * delay extent tree at the following comment. But for better - * understand what it does, it has been rename to extent status tree. 
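
For orientation, the ext4_fiemap()/ext4_ext_fiemap_cb path restored in this hunk reports its results to user space through fiemap_fill_next_extent(); a minimal consumer of that data via the FS_IOC_FIEMAP ioctl looks roughly like the sketch below (the file path is only a placeholder):

/*
 * Minimal FS_IOC_FIEMAP consumer; the user-space view of what
 * ext4_ext_fiemap_cb() fills in via fiemap_fill_next_extent().
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	const unsigned int nr = 32;
	struct fiemap *fm;
	unsigned int i;
	int fd;

	fm = calloc(1, sizeof(*fm) + nr * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;

	fd = open("/tmp/testfile", O_RDONLY);	/* placeholder path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = nr;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu len %llu%s%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " delalloc" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " unwritten" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " last" : "");
	}

	close(fd);
	free(fm);
	return 0;
}
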
- * - * Currently the first step has been done. All delay extents are - * tracked in the tree. It maintains the delay extent when a delay - * allocation is issued, and the delay extent is written out or - * invalidated. Therefore the implementation of fiemap and bigalloc - * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. - * - * The following comment describes the implemenmtation of extent - * status tree and future works. - */ - -/* - * extents status tree implementation for ext4. - * - * - * ========================================================================== - * Extents status encompass delayed extents and extent locks - * - * 1. Why delayed extent implementation ? - * - * Without delayed extent, ext4 identifies a delayed extent by looking - * up page cache, this has several deficiencies - complicated, buggy, - * and inefficient code. - * - * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need - * to know if a block or a range of blocks are belonged to a delayed - * extent. - * - * Let us have a look at how they do without delayed extents implementation. - * -- FIEMAP - * FIEMAP looks up page cache to identify delayed allocations from holes. - * - * -- SEEK_HOLE/DATA - * SEEK_HOLE/DATA has the same problem as FIEMAP. - * - * -- bigalloc - * bigalloc looks up page cache to figure out if a block is - * already under delayed allocation or not to determine whether - * quota reserving is needed for the cluster. - * - * -- punch hole - * punch hole looks up page cache to identify a delayed extent. - * - * -- writeout - * Writeout looks up whole page cache to see if a buffer is - * mapped, If there are not very many delayed buffers, then it is - * time comsuming. - * - * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, - * bigalloc and writeout can figure out if a block or a range of - * blocks is under delayed allocation(belonged to a delayed extent) or - * not by searching the delayed extent tree. - * - * - * ========================================================================== - * 2. ext4 delayed extents impelmentation - * - * -- delayed extent - * A delayed extent is a range of blocks which are contiguous - * logically and under delayed allocation. Unlike extent in - * ext4, delayed extent in ext4 is a in-memory struct, there is - * no corresponding on-disk data. There is no limit on length of - * delayed extent, so a delayed extent can contain as many blocks - * as they are contiguous logically. - * - * -- delayed extent tree - * Every inode has a delayed extent tree and all under delayed - * allocation blocks are added to the tree as delayed extents. - * Delayed extents in the tree are ordered by logical block no. - * - * -- operations on a delayed extent tree - * There are three operations on a delayed extent tree: find next - * delayed extent, adding a space(a range of blocks) and removing - * a space. - * - * -- race on a delayed extent tree - * Delayed extent tree is protected inode->i_es_lock. - * - * - * ========================================================================== - * 3. performance analysis - * -- overhead - * 1. There is a cache extent for write access, so if writes are - * not very random, adding space operaions are in O(1) time. - * - * -- gain - * 2. Code is much simpler, more readable, more maintainable and - * more efficient. - * - * - * ========================================================================== - * 4. 
TODO list - * -- Track all extent status - * - * -- Improve get block process - * - * -- Extent-level locking - */ - -static struct kmem_cache *ext4_es_cachep; - -int __init ext4_init_es(void) -{ - ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); - if (ext4_es_cachep == NULL) - return -ENOMEM; - return 0; -} - -void ext4_exit_es(void) -{ - if (ext4_es_cachep) - kmem_cache_destroy(ext4_es_cachep); -} - -void ext4_es_init_tree(struct ext4_es_tree *tree) -{ - tree->root = RB_ROOT; - tree->cache_es = NULL; -} - -#ifdef ES_DEBUG__ -static void ext4_es_print_tree(struct inode *inode) -{ - struct ext4_es_tree *tree; - struct rb_node *node; - - printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); - tree = &EXT4_I(inode)->i_es_tree; - node = rb_first(&tree->root); - while (node) { - struct extent_status *es; - es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u)", es->start, es->len); - node = rb_next(node); - } - printk(KERN_DEBUG "\n"); -} -#else -#define ext4_es_print_tree(inode) -#endif - -static inline ext4_lblk_t extent_status_end(struct extent_status *es) -{ - BUG_ON(es->start + es->len < es->start); - return es->start + es->len - 1; -} - -/* - * search through the tree for an delayed extent with a given offset. If - * it can't be found, try to find next extent. - */ -static struct extent_status *__es_tree_search(struct rb_root *root, - ext4_lblk_t offset) -{ - struct rb_node *node = root->rb_node; - struct extent_status *es = NULL; - - while (node) { - es = rb_entry(node, struct extent_status, rb_node); - if (offset < es->start) - node = node->rb_left; - else if (offset > extent_status_end(es)) - node = node->rb_right; - else - return es; - } - - if (es && offset < es->start) - return es; - - if (es && offset > extent_status_end(es)) { - node = rb_next(&es->rb_node); - return node ? rb_entry(node, struct extent_status, rb_node) : - NULL; - } - - return NULL; -} - -/* - * ext4_es_find_extent: find the 1st delayed extent covering @es->start - * if it exists, otherwise, the next extent after @es->start. - * - * @inode: the inode which owns delayed extents - * @es: delayed extent that we found - * - * Returns the first block of the next extent after es, otherwise - * EXT_MAX_BLOCKS if no delay extent is found. - * Delayed extent is returned via @es. 
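
The lookup rule documented above for ext4_es_find_extent()/__es_tree_search() is "return the extent covering the offset if one exists, otherwise the next extent after it". A flat-array model of that rule, using simplified stand-in types rather than the rbtree the kernel code uses:

/*
 * Flat-array model of the __es_tree_search()/ext4_es_find_extent()
 * lookup rule; toy_es and toy_es_search are illustrative names only.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_es {
	unsigned int start;	/* first block the extent covers */
	unsigned int len;	/* length in blocks */
};

static unsigned int toy_es_end(const struct toy_es *es)
{
	return es->start + es->len - 1;
}

/* @tbl is sorted by start and the extents do not overlap. */
static const struct toy_es *toy_es_search(const struct toy_es *tbl, int nr,
					  unsigned int offset)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (offset < tbl[i].start)
			return &tbl[i];		/* next extent after offset */
		if (offset <= toy_es_end(&tbl[i]))
			return &tbl[i];		/* extent covering offset */
	}
	return NULL;				/* nothing at or after offset */
}

int main(void)
{
	struct toy_es tbl[] = { {10, 5}, {30, 2} };	/* [10/5), [30/2) */
	unsigned int probes[] = { 12, 20, 40 };
	int i;

	for (i = 0; i < 3; i++) {
		const struct toy_es *es = toy_es_search(tbl, 2, probes[i]);

		if (es)
			printf("offset %u -> [%u/%u)\n",
			       probes[i], es->start, es->len);
		else
			printf("offset %u -> no delayed extent\n", probes[i]);
	}
	return 0;
}
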
- */ -ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) -{ - struct ext4_es_tree *tree = NULL; - struct extent_status *es1 = NULL; - struct rb_node *node; - ext4_lblk_t ret = EXT_MAX_BLOCKS; - - trace_ext4_es_find_extent_enter(inode, es->start); - - read_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - - /* find delay extent in cache firstly */ - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(es->start, es1->start, es1->len)) { - es_debug("%u cached by [%u/%u)\n", - es->start, es1->start, es1->len); - goto out; - } - } - - es->len = 0; - es1 = __es_tree_search(&tree->root, es->start); - -out: - if (es1) { - tree->cache_es = es1; - es->start = es1->start; - es->len = es1->len; - node = rb_next(&es1->rb_node); - if (node) { - es1 = rb_entry(node, struct extent_status, rb_node); - ret = es1->start; - } - } - - read_unlock(&EXT4_I(inode)->i_es_lock); - - trace_ext4_es_find_extent_exit(inode, es, ret); - return ret; -} - -static struct extent_status * -ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) -{ - struct extent_status *es; - es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); - if (es == NULL) - return NULL; - es->start = start; - es->len = len; - return es; -} - -static void ext4_es_free_extent(struct extent_status *es) -{ - kmem_cache_free(ext4_es_cachep, es); -} - -static struct extent_status * -ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) -{ - struct extent_status *es1; - struct rb_node *node; - - node = rb_prev(&es->rb_node); - if (!node) - return es; - - es1 = rb_entry(node, struct extent_status, rb_node); - if (es->start == extent_status_end(es1) + 1) { - es1->len += es->len; - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); - es = es1; - } - - return es; -} - -static struct extent_status * -ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) -{ - struct extent_status *es1; - struct rb_node *node; - - node = rb_next(&es->rb_node); - if (!node) - return es; - - es1 = rb_entry(node, struct extent_status, rb_node); - if (es1->start == extent_status_end(es) + 1) { - es->len += es1->len; - rb_erase(node, &tree->root); - ext4_es_free_extent(es1); - } - - return es; -} - -static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, - ext4_lblk_t len) -{ - struct rb_node **p = &tree->root.rb_node; - struct rb_node *parent = NULL; - struct extent_status *es; - ext4_lblk_t end = offset + len - 1; - - BUG_ON(end < offset); - es = tree->cache_es; - if (es && offset == (extent_status_end(es) + 1)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } else if (es && es->start == end + 1) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } else if (es && es->start <= offset && - end <= extent_status_end(es)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - goto out; - } - - while (*p) { - parent = *p; - es = rb_entry(parent, struct extent_status, rb_node); - - if (offset < es->start) { - if (es->start == end + 1) { - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } - p = &(*p)->rb_left; - } else if (offset > extent_status_end(es)) { - if (offset == extent_status_end(es) + 1) { - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } - p = &(*p)->rb_right; - } else { - 
if (extent_status_end(es) <= end) - es->len = offset - es->start + len; - goto out; - } - } - - es = ext4_es_alloc_extent(offset, len); - if (!es) - return -ENOMEM; - rb_link_node(&es->rb_node, parent, p); - rb_insert_color(&es->rb_node, &tree->root); - -out: - tree->cache_es = es; - return 0; -} - -/* - * ext4_es_insert_extent() adds a space to a delayed extent tree. - * Caller holds inode->i_es_lock. - * - * ext4_es_insert_extent is called by ext4_da_write_begin and - * ext4_es_remove_extent. - * - * Return 0 on success, error code on failure. - */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) -{ - struct ext4_es_tree *tree; - int err = 0; - - trace_ext4_es_insert_extent(inode, offset, len); - es_debug("add [%u/%u) to extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - err = __es_insert_extent(tree, offset, len); - write_unlock(&EXT4_I(inode)->i_es_lock); - - ext4_es_print_tree(inode); - - return err; -} - -/* - * ext4_es_remove_extent() removes a space from a delayed extent tree. - * Caller holds inode->i_es_lock. - * - * Return 0 on success, error code on failure. - */ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) -{ - struct rb_node *node; - struct ext4_es_tree *tree; - struct extent_status *es; - struct extent_status orig_es; - ext4_lblk_t len1, len2, end; - int err = 0; - - trace_ext4_es_remove_extent(inode, offset, len); - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - end = offset + len - 1; - BUG_ON(end < offset); - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - es = __es_tree_search(&tree->root, offset); - if (!es) - goto out; - if (es->start > end) - goto out; - - /* Simply invalidate cache_es. */ - tree->cache_es = NULL; - - orig_es.start = es->start; - orig_es.len = es->len; - len1 = offset > es->start ? offset - es->start : 0; - len2 = extent_status_end(es) > end ? - extent_status_end(es) - end : 0; - if (len1 > 0) - es->len = len1; - if (len2 > 0) { - if (len1 > 0) { - err = __es_insert_extent(tree, end + 1, len2); - if (err) { - es->start = orig_es.start; - es->len = orig_es.len; - goto out; - } - } else { - es->start = end + 1; - es->len = len2; - } - goto out; - } - - if (len1 > 0) { - node = rb_next(&es->rb_node); - if (node) - es = rb_entry(node, struct extent_status, rb_node); - else - es = NULL; - } - - while (es && extent_status_end(es) <= end) { - node = rb_next(&es->rb_node); - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); - if (!node) { - es = NULL; - break; - } - es = rb_entry(node, struct extent_status, rb_node); - } - - if (es && es->start < end + 1) { - len1 = extent_status_end(es) - end; - es->start = end + 1; - es->len = len1; - } - -out: - write_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_print_tree(inode); - return err; -} diff --git a/trunk/fs/ext4/extents_status.h b/trunk/fs/ext4/extents_status.h deleted file mode 100644 index 077f82db092a..000000000000 --- a/trunk/fs/ext4/extents_status.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * fs/ext4/extents_status.h - * - * Written by Yongqiang Yang - * Modified by - * Allison Henderson - * Zheng Liu - * - */ - -#ifndef _EXT4_EXTENTS_STATUS_H -#define _EXT4_EXTENTS_STATUS_H - -/* - * Turn on ES_DEBUG__ to get lots of info about extent status operations. - */ -#ifdef ES_DEBUG__ -#define es_debug(fmt, ...) 
printk(fmt, ##__VA_ARGS__) -#else -#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - -struct extent_status { - struct rb_node rb_node; - ext4_lblk_t start; /* first block extent covers */ - ext4_lblk_t len; /* length of extent in block */ -}; - -struct ext4_es_tree { - struct rb_root root; - struct extent_status *cache_es; /* recently accessed extent */ -}; - -extern int __init ext4_init_es(void); -extern void ext4_exit_es(void); -extern void ext4_es_init_tree(struct ext4_es_tree *tree); - -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t len); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t len); -extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, - struct extent_status *es); - -#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/trunk/fs/ext4/file.c b/trunk/fs/ext4/file.c index b64a60bf105a..bf3966bccd34 100644 --- a/trunk/fs/ext4/file.c +++ b/trunk/fs/ext4/file.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -286,324 +285,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } -/* - * Here we use ext4_map_blocks() to get a block mapping for a extent-based - * file rather than ext4_ext_walk_space() because we can introduce - * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same - * function. When extent status tree has been fully implemented, it will - * track all extent status for a file and we can directly use it to - * retrieve the offset for SEEK_DATA/SEEK_HOLE. - */ - -/* - * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to - * lookup page cache to check whether or not there has some data between - * [startoff, endoff] because, if this range contains an unwritten extent, - * we determine this extent as a data or a hole according to whether the - * page cache has data or not. - */ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int origin, - struct ext4_map_blocks *map, - loff_t *offset) -{ - struct pagevec pvec; - unsigned int blkbits; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff; - loff_t lastoff; - int found = 0; - - blkbits = inode->i_sb->s_blocksize_bits; - startoff = *offset; - lastoff = startoff; - endoff = (map->m_lblk + map->m_len) << blkbits; - - index = startoff >> PAGE_CACHE_SHIFT; - end = endoff >> PAGE_CACHE_SHIFT; - - pagevec_init(&pvec, 0); - do { - int i, num; - unsigned long nr_pages; - - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - (pgoff_t)num); - if (nr_pages == 0) { - if (origin == SEEK_DATA) - break; - - BUG_ON(origin != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; - break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && origin == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; - break; - } - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - /* - * If the current offset is not beyond the end of given - * range, it will be a hole. 
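
The ext4_seek_data()/ext4_seek_hole() code being deleted in this hunk backs the SEEK_DATA/SEEK_HOLE whence values of llseek. From user space the interface is plain lseek(2); a small sketch follows (the path is a placeholder and should name an existing, ideally sparse, file):

/*
 * User-space view of the SEEK_DATA/SEEK_HOLE support implemented by
 * ext4_seek_data()/ext4_seek_hole().  Sketch only; lseek() returns -1
 * with errno ENXIO when there is no further data or hole.
 */
#define _GNU_SOURCE		/* for SEEK_DATA / SEEK_HOLE */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	off_t data, hole;
	int fd = open("/tmp/sparsefile", O_RDONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	data = lseek(fd, 0, SEEK_DATA);	/* first byte of data at/after 0 */
	hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at/after 0 */

	printf("first data at %lld, first hole at %lld\n",
	       (long long)data, (long long)hole);

	close(fd);
	return 0;
}
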
- */ - if (lastoff < endoff && origin == SEEK_HOLE && - page->index > end) { - found = 1; - *offset = lastoff; - goto out; - } - - lock_page(page); - - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - if (page_has_buffers(page)) { - lastoff = page_offset(page); - bh = head = page_buffers(page); - do { - if (buffer_uptodate(bh) || - buffer_unwritten(bh)) { - if (origin == SEEK_DATA) - found = 1; - } else { - if (origin == SEEK_HOLE) - found = 1; - } - if (found) { - *offset = max_t(loff_t, - startoff, lastoff); - unlock_page(page); - goto out; - } - lastoff += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && origin == SEEK_HOLE) { - found = 1; - *offset = lastoff; - break; - } - - index = pvec.pages[i - 1]->index + 1; - pagevec_release(&pvec); - } while (index <= end); - -out: - pagevec_release(&pvec); - return found; -} - -/* - * ext4_seek_data() retrieves the offset for SEEK_DATA. - */ -static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret = 0; - - mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = last << blkbits; - break; - } - - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - if (last != start) - dataoff = last << blkbits; - break; - } - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; - dataoff = last << blkbits; - } while (last <= end); - - mutex_unlock(&inode->i_mutex); - - if (dataoff > isize) - return -ENXIO; - - if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (dataoff > maxsize) - return -EINVAL; - - if (dataoff != file->f_pos) { - file->f_pos = dataoff; - file->f_version = 0; - } - - return dataoff; -} - -/* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
- */ -static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret = 0; - - mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; - - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = last << blkbits; - continue; - } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - last = es.start + es.len; - holeoff = last << blkbits; - continue; - } - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = last << blkbits; - continue; - } - } - - /* find a hole */ - break; - } while (last <= end); - - mutex_unlock(&inode->i_mutex); - - if (holeoff > isize) - holeoff = isize; - - if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (holeoff > maxsize) - return -EINVAL; - - if (holeoff != file->f_pos) { - file->f_pos = holeoff; - file->f_version = 0; - } - - return holeoff; -} - /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes @@ -619,19 +300,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - switch (origin) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: - return generic_file_llseek_size(file, offset, origin, - maxbytes, i_size_read(inode)); - case SEEK_DATA: - return ext4_seek_data(file, offset, maxbytes); - case SEEK_HOLE: - return ext4_seek_hole(file, offset, maxbytes); - } - - return -EINVAL; + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); } const struct file_operations ext4_file_operations = { @@ -656,10 +326,12 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/trunk/fs/ext4/fsync.c b/trunk/fs/ext4/fsync.c index dfbc1fe96674..be1d89f385b4 100644 --- a/trunk/fs/ext4/fsync.c +++ b/trunk/fs/ext4/fsync.c @@ -44,6 +44,7 @@ */ static int ext4_sync_parent(struct inode *inode) { + struct writeback_control wbc; struct dentry *dentry = NULL; struct inode *next; int ret = 0; @@ -65,7 +66,10 @@ static int ext4_sync_parent(struct inode *inode) ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; - ret = sync_inode_metadata(inode, 1); + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.nr_to_write = 0; /* only write out the inode */ + ret = 
sync_inode(inode, &wbc); if (ret) break; } diff --git a/trunk/fs/ext4/ialloc.c b/trunk/fs/ext4/ialloc.c index 3f32c8012447..3a100e7a62a8 100644 --- a/trunk/fs/ext4/ialloc.c +++ b/trunk/fs/ext4/ialloc.c @@ -762,6 +762,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -774,7 +775,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); - brelse(block_bitmap_bh); if (err) goto fail; @@ -902,10 +902,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; - ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - ret = inode; dquot_initialize(inode); err = dquot_alloc_inode(inode); diff --git a/trunk/fs/ext4/indirect.c b/trunk/fs/ext4/indirect.c index 20862f96e8ae..792e388e7b44 100644 --- a/trunk/fs/ext4/indirect.c +++ b/trunk/fs/ext4/indirect.c @@ -22,7 +22,6 @@ #include "ext4_jbd2.h" #include "truncate.h" -#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include @@ -756,7 +755,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map, err); + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); return err; } @@ -1412,7 +1412,6 @@ void ext4_ind_truncate(struct inode *inode) down_write(&ei->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which diff --git a/trunk/fs/ext4/inline.c b/trunk/fs/ext4/inline.c deleted file mode 100644 index 387c47c6cda9..000000000000 --- a/trunk/fs/ext4/inline.c +++ /dev/null @@ -1,1884 +0,0 @@ -/* - * Copyright (c) 2012 Taobao. - * Written by Tao Ma - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" -#include "truncate.h" -#include - -#define EXT4_XATTR_SYSTEM_DATA "data" -#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) -#define EXT4_INLINE_DOTDOT_SIZE 4 - -int ext4_get_inline_size(struct inode *inode) -{ - if (EXT4_I(inode)->i_inline_off) - return EXT4_I(inode)->i_inline_size; - - return 0; -} - -static int get_max_inline_xattr_value_size(struct inode *inode, - struct ext4_iloc *iloc) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; - struct ext4_inode *raw_inode; - int free, min_offs; - - min_offs = EXT4_SB(inode->i_sb)->s_inode_size - - EXT4_GOOD_OLD_INODE_SIZE - - EXT4_I(inode)->i_extra_isize - - sizeof(struct ext4_xattr_ibody_header); - - /* - * We need to subtract another sizeof(__u32) since an in-inode xattr - * needs an empty 4 bytes to indicate the gap between the xattr entry - * and the name/value pair. - */ - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) - return EXT4_XATTR_SIZE(min_offs - - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - - EXT4_XATTR_ROUND - sizeof(__u32)); - - raw_inode = ext4_raw_inode(iloc); - header = IHDR(inode, raw_inode); - entry = IFIRST(header); - - /* Compute min_offs. */ - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { - size_t offs = le16_to_cpu(entry->e_value_offs); - if (offs < min_offs) - min_offs = offs; - } - } - free = min_offs - - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); - - if (EXT4_I(inode)->i_inline_off) { - entry = (struct ext4_xattr_entry *) - ((void *)raw_inode + EXT4_I(inode)->i_inline_off); - - free += le32_to_cpu(entry->e_value_size); - goto out; - } - - free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); - - if (free > EXT4_XATTR_ROUND) - free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); - else - free = 0; - -out: - return free; -} - -/* - * Get the maximum size we now can store in an inode. - * If we can't find the space for a xattr entry, don't use the space - * of the extents since we have no space to indicate the inline data. 
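
As the comment above notes, the usable inline-data capacity is EXT4_MIN_INLINE_DATA_SIZE, i.e. sizeof(__le32) * EXT4_N_BLOCKS = 60 bytes reused from i_block, plus whatever room the in-inode "system.data" xattr value can provide. A standalone model of how a given inline length splits across those two areas, mirroring the cp_len logic of ext4_read_inline_data() (not kernel code):

/*
 * Model of the i_block / system.data split used by ext4 inline data.
 * TOY_MIN_INLINE_DATA_SIZE stands in for EXT4_MIN_INLINE_DATA_SIZE.
 */
#include <stdio.h>

#define TOY_MIN_INLINE_DATA_SIZE (4 * 15)	/* sizeof(__le32) * EXT4_N_BLOCKS */

static void toy_split_inline(unsigned int len)
{
	unsigned int from_i_block = len < TOY_MIN_INLINE_DATA_SIZE ?
					len : TOY_MIN_INLINE_DATA_SIZE;
	unsigned int from_xattr = len - from_i_block;

	printf("%3u bytes inline: %2u from i_block, %3u from system.data\n",
	       len, from_i_block, from_xattr);
}

int main(void)
{
	toy_split_inline(40);	/* fits entirely in i_block */
	toy_split_inline(60);	/* exactly fills i_block */
	toy_split_inline(140);	/* 60 in i_block + 80 in the xattr value */
	return 0;
}
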
- */ -int ext4_get_max_inline_size(struct inode *inode) -{ - int error, max_inline_size; - struct ext4_iloc iloc; - - if (EXT4_I(inode)->i_extra_isize == 0) - return 0; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) { - ext4_error_inode(inode, __func__, __LINE__, 0, - "can't get inode location %lu", - inode->i_ino); - return 0; - } - - down_read(&EXT4_I(inode)->xattr_sem); - max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); - up_read(&EXT4_I(inode)->xattr_sem); - - brelse(iloc.bh); - - if (!max_inline_size) - return 0; - - return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; -} - -int ext4_has_inline_data(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && - EXT4_I(inode)->i_inline_off; -} - -/* - * this function does not take xattr_sem, which is OK because it is - * currently only used in a code path coming form ext4_iget, before - * the new inode has been unlocked - */ -int ext4_find_inline_data_nolock(struct inode *inode) -{ - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return 0; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - if (!is.s.not_found) { - EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - - (void *)ext4_raw_inode(&is.iloc)); - EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + - le32_to_cpu(is.s.here->e_value_size); - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - } -out: - brelse(is.iloc.bh); - return error; -} - -static int ext4_read_inline_data(struct inode *inode, void *buffer, - unsigned int len, - struct ext4_iloc *iloc) -{ - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - int cp_len = 0; - struct ext4_inode *raw_inode; - - if (!len) - return 0; - - BUG_ON(len > EXT4_I(inode)->i_inline_size); - - cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? - len : EXT4_MIN_INLINE_DATA_SIZE; - - raw_inode = ext4_raw_inode(iloc); - memcpy(buffer, (void *)(raw_inode->i_block), cp_len); - - len -= cp_len; - buffer += cp_len; - - if (!len) - goto out; - - header = IHDR(inode, raw_inode); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - len = min_t(unsigned int, len, - (unsigned int)le32_to_cpu(entry->e_value_size)); - - memcpy(buffer, - (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); - cp_len += len; - -out: - return cp_len; -} - -/* - * write the buffer to the inline inode. - * If 'create' is set, we don't need to do the extra copy in the xattr - * value since it is already handled by ext4_xattr_ibody_inline_set. - * That saves us one memcpy. - */ -void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, - void *buffer, loff_t pos, unsigned int len) -{ - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; - int cp_len = 0; - - BUG_ON(!EXT4_I(inode)->i_inline_off); - BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); - - raw_inode = ext4_raw_inode(iloc); - buffer += pos; - - if (pos < EXT4_MIN_INLINE_DATA_SIZE) { - cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? 
- EXT4_MIN_INLINE_DATA_SIZE - pos : len; - memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); - - len -= cp_len; - buffer += cp_len; - pos += cp_len; - } - - if (!len) - return; - - pos -= EXT4_MIN_INLINE_DATA_SIZE; - header = IHDR(inode, raw_inode); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - - memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, - buffer, len); -} - -static int ext4_create_inline_data(handle_t *handle, - struct inode *inode, unsigned len) -{ - int error; - void *value = NULL; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto out; - - if (len > EXT4_MIN_INLINE_DATA_SIZE) { - value = EXT4_ZERO_XATTR_VALUE; - len -= EXT4_MIN_INLINE_DATA_SIZE; - } else { - value = ""; - len = 0; - } - - /* Insert the the xttr entry. */ - i.value = value; - i.value_len = len; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - BUG_ON(!is.s.not_found); - - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); - if (error) { - if (error == -ENOSPC) - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - goto out; - } - - memset((void *)ext4_raw_inode(&is.iloc)->i_block, - 0, EXT4_MIN_INLINE_DATA_SIZE); - - EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - - (void *)ext4_raw_inode(&is.iloc)); - EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; - ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); - get_bh(is.iloc.bh); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - -out: - brelse(is.iloc.bh); - return error; -} - -static int ext4_update_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) -{ - int error; - void *value = NULL; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - - /* If the old space is ok, write the data directly. */ - if (len <= EXT4_I(inode)->i_inline_size) - return 0; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - BUG_ON(is.s.not_found); - - len -= EXT4_MIN_INLINE_DATA_SIZE; - value = kzalloc(len, GFP_NOFS); - if (!value) - goto out; - - error = ext4_xattr_ibody_get(inode, i.name_index, i.name, - value, len); - if (error == -ENODATA) - goto out; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto out; - - /* Update the xttr entry. 
*/ - i.value = value; - i.value_len = len; - - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); - if (error) - goto out; - - EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - - (void *)ext4_raw_inode(&is.iloc)); - EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + - le32_to_cpu(is.s.here->e_value_size); - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - get_bh(is.iloc.bh); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - -out: - kfree(value); - brelse(is.iloc.bh); - return error; -} - -int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) -{ - int ret, size; - struct ext4_inode_info *ei = EXT4_I(inode); - - if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) - return -ENOSPC; - - size = ext4_get_max_inline_size(inode); - if (size < len) - return -ENOSPC; - - down_write(&EXT4_I(inode)->xattr_sem); - - if (ei->i_inline_off) - ret = ext4_update_inline_data(handle, inode, len); - else - ret = ext4_create_inline_data(handle, inode, len); - - up_write(&EXT4_I(inode)->xattr_sem); - - return ret; -} - -static int ext4_destroy_inline_data_nolock(handle_t *handle, - struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_xattr_ibody_find is = { - .s = { .not_found = 0, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - .value = NULL, - .value_len = 0, - }; - int error; - - if (!ei->i_inline_off) - return 0; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto out; - - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); - if (error) - goto out; - - memset((void *)ext4_raw_inode(&is.iloc)->i_block, - 0, EXT4_MIN_INLINE_DATA_SIZE); - - if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { - if (S_ISDIR(inode->i_mode) || - S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { - ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); - ext4_ext_tree_init(handle, inode); - } - } - ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); - - get_bh(is.iloc.bh); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - - EXT4_I(inode)->i_inline_off = 0; - EXT4_I(inode)->i_inline_size = 0; - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -out: - brelse(is.iloc.bh); - if (error == -ENODATA) - error = 0; - return error; -} - -static int ext4_read_inline_page(struct inode *inode, struct page *page) -{ - void *kaddr; - int ret = 0; - size_t len; - struct ext4_iloc iloc; - - BUG_ON(!PageLocked(page)); - BUG_ON(!ext4_has_inline_data(inode)); - BUG_ON(page->index); - - if (!EXT4_I(inode)->i_inline_off) { - ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", - inode->i_ino); - goto out; - } - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - goto out; - - len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - kaddr = kmap_atomic(page); - ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_page(page); - kunmap_atomic(kaddr); - zero_user_segment(page, len, PAGE_CACHE_SIZE); - SetPageUptodate(page); - brelse(iloc.bh); - -out: - return ret; -} - -int ext4_readpage_inline(struct inode *inode, struct page *page) -{ - int ret = 0; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_read(&EXT4_I(inode)->xattr_sem); - return -EAGAIN; - } - - /* - 
* Current inline data can only exist in the 1st page, - * So for all the other pages, just set them uptodate. - */ - if (!page->index) - ret = ext4_read_inline_page(inode, page); - else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); - } - - up_read(&EXT4_I(inode)->xattr_sem); - - unlock_page(page); - return ret >= 0 ? 0 : ret; -} - -static int ext4_convert_inline_data_to_extent(struct address_space *mapping, - struct inode *inode, - unsigned flags) -{ - int ret, needed_blocks; - handle_t *handle = NULL; - int retries = 0, sem_held = 0; - struct page *page = NULL; - unsigned from, to; - struct ext4_iloc iloc; - - if (!ext4_has_inline_data(inode)) { - /* - * clear the flag so that no new write - * will trap here again. - */ - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - return 0; - } - - needed_blocks = ext4_writepage_trans_blocks(inode); - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry: - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - down_write(&EXT4_I(inode)->xattr_sem); - sem_held = 1; - /* If some one has already done this for us, just exit. */ - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out; - } - - from = 0; - to = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out; - } - - ret = ext4_destroy_inline_data_nolock(handle, inode); - if (ret) - goto out; - - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, from, to, ext4_get_block_write); - else - ret = __block_write_begin(page, from, to, ext4_get_block); - - if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); - } - - if (ret) { - unlock_page(page); - page_cache_release(page); - ext4_orphan_add(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); - sem_held = 0; - ext4_journal_stop(handle); - handle = NULL; - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might - * still be on the orphan list; we need to - * make sure the inode is removed from the - * orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - block_commit_write(page, from, to); -out: - if (page) { - unlock_page(page); - page_cache_release(page); - } - if (sem_held) - up_write(&EXT4_I(inode)->xattr_sem); - if (handle) - ext4_journal_stop(handle); - brelse(iloc.bh); - return ret; -} - -/* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. 
- */ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep) -{ - int ret; - handle_t *handle; - struct page *page; - struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. - */ - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out; - - /* We don't have space in inline inode, so convert it to extent. */ - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } - - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - *pagep = page; - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - unlock_page(page); - page_cache_release(page); - goto out_up_read; - } - - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out_up_read; - } - - ret = 1; - handle = NULL; -out_up_read: - up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle) - ext4_journal_stop(handle); - brelse(iloc.bh); - return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, - inode, flags); -} - -int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) -{ - int ret; - void *kaddr; - struct ext4_iloc iloc; - - if (unlikely(copied < len)) { - if (!PageUptodate(page)) { - copied = 0; - goto out; - } - } - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - copied = 0; - goto out; - } - - down_write(&EXT4_I(inode)->xattr_sem); - BUG_ON(!ext4_has_inline_data(inode)); - - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); - - up_write(&EXT4_I(inode)->xattr_sem); - brelse(iloc.bh); -out: - return copied; -} - -struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - int ret; - void *kaddr; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return NULL; - } - - down_write(&EXT4_I(inode)->xattr_sem); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, 0, len); - kunmap_atomic(kaddr); - up_write(&EXT4_I(inode)->xattr_sem); - - return iloc.bh; -} - -/* - * Try to make the page cache and handle ready for the inline data case. - * We can call this function in 2 cases: - * 1. The inode is created and the first write exceeds inline size. We can - * clear the inode state safely. - * 2. The inode has inline data, then we need to read the data, make it - * update and dirty so that ext4_da_writepages can handle it. We don't - * need to start the journal since the file's metatdata isn't changed now. 
- */ -static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, - struct inode *inode, - unsigned flags, - void **fsdata) -{ - int ret = 0, inline_size; - struct page *page; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) - return -ENOMEM; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - goto out; - } - - inline_size = ext4_get_inline_size(inode); - - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out; - } - - ret = __block_write_begin(page, 0, inline_size, - ext4_da_get_block_prep); - if (ret) { - ext4_truncate_failed_write(inode); - goto out; - } - - SetPageDirty(page); - SetPageUptodate(page); - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - *fsdata = (void *)CONVERT_INLINE_DATA; - -out: - up_read(&EXT4_I(inode)->xattr_sem); - if (page) { - unlock_page(page); - page_cache_release(page); - } - return ret; -} - -/* - * Prepare the write for the inline data. - * If the the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - int ret, inline_size; - handle_t *handle; - struct page *page; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - inline_size = ext4_get_max_inline_size(inode); - - ret = -ENOSPC; - if (inline_size >= pos + len) { - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out; - } - - if (ret == -ENOSPC) { - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - flags, - fsdata); - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out_release_page; - } - - up_read(&EXT4_I(inode)->xattr_sem); - *pagep = page; - handle = NULL; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - page_cache_release(page); -out: - if (handle) - ext4_journal_stop(handle); - brelse(iloc.bh); - return ret; -} - -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int i_size_changed = 0; - - copied = ext4_write_inline_data_end(inode, pos, len, copied, page); - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. 
- */ - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - i_size_changed = 1; - } - unlock_page(page); - page_cache_release(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - if (i_size_changed) - mark_inode_dirty(inode); - - return copied; -} - -#ifdef INLINE_DIR_DEBUG -void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, - void *inline_start, int inline_size) -{ - int offset; - unsigned short de_len; - struct ext4_dir_entry_2 *de = inline_start; - void *dlimit = inline_start + inline_size; - - trace_printk("inode %lu\n", dir->i_ino); - offset = 0; - while ((void *)de < dlimit) { - de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); - trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", - offset, de_len, de->name_len, de->name, - de->name_len, le32_to_cpu(de->inode)); - if (ext4_check_dir_entry(dir, NULL, de, bh, - inline_start, inline_size, offset)) - BUG(); - - offset += de_len; - de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); - } -} -#else -#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) -#endif - -/* - * Add a new entry into a inline dir. - * It will return -ENOSPC if no space is available, and -EIO - * and -EEXIST if directory entry already exists. - */ -static int ext4_add_dirent_to_inline(handle_t *handle, - struct dentry *dentry, - struct inode *inode, - struct ext4_iloc *iloc, - void *inline_start, int inline_size) -{ - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned short reclen; - int err; - struct ext4_dir_entry_2 *de; - - reclen = EXT4_DIR_REC_LEN(namelen); - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); - if (err) - return err; - - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) - return err; - ext4_insert_dentry(inode, de, inline_size, name, namelen); - - ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); - - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend - * on this. - * - * XXX similarly, too many callers depend on - * ext4_new_inode() setting the times, but error - * recovery deletes the inode, so the worst that can - * happen is that the times are slightly out of date - * and/or different from the directory change time. - */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); - ext4_update_dx_flag(dir); - dir->i_version++; - ext4_mark_inode_dirty(handle, dir); - return 1; -} - -static void *ext4_get_inline_xattr_pos(struct inode *inode, - struct ext4_iloc *iloc) -{ - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - - BUG_ON(!EXT4_I(inode)->i_inline_off); - - header = IHDR(inode, ext4_raw_inode(iloc)); - entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + - EXT4_I(inode)->i_inline_off); - - return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); -} - -/* Set the final de to cover the whole block. 
*/ -static void ext4_update_final_de(void *de_buf, int old_size, int new_size) -{ - struct ext4_dir_entry_2 *de, *prev_de; - void *limit; - int de_len; - - de = (struct ext4_dir_entry_2 *)de_buf; - if (old_size) { - limit = de_buf + old_size; - do { - prev_de = de; - de_len = ext4_rec_len_from_disk(de->rec_len, old_size); - de_buf += de_len; - de = (struct ext4_dir_entry_2 *)de_buf; - } while (de_buf < limit); - - prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - - old_size, new_size); - } else { - /* this is just created, so create an empty entry. */ - de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(new_size, new_size); - } -} - -static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, - struct ext4_iloc *iloc) -{ - int ret; - int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; - int new_size = get_max_inline_xattr_value_size(dir, iloc); - - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) - return -ENOSPC; - - ret = ext4_update_inline_data(handle, dir, - new_size + EXT4_MIN_INLINE_DATA_SIZE); - if (ret) - return ret; - - ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, - EXT4_I(dir)->i_inline_size - - EXT4_MIN_INLINE_DATA_SIZE); - dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; - return 0; -} - -static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, - struct ext4_iloc *iloc, - void *buf, int inline_size) -{ - ext4_create_inline_data(handle, inode, inline_size); - ext4_write_inline_data(inode, iloc, buf, 0, inline_size); - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -} - -static int ext4_finish_convert_inline_dir(handle_t *handle, - struct inode *inode, - struct buffer_head *dir_block, - void *buf, - int inline_size) -{ - int err, csum_size = 0, header_size = 0; - struct ext4_dir_entry_2 *de; - struct ext4_dir_entry_tail *t; - void *target = dir_block->b_data; - - /* - * First create "." and ".." and then copy the dir information - * back to the block. 
- */ - de = (struct ext4_dir_entry_2 *)target; - de = ext4_init_dot_dotdot(inode, de, - inode->i_sb->s_blocksize, csum_size, - le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); - header_size = (void *)de - target; - - memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, - inline_size - EXT4_INLINE_DOTDOT_SIZE); - - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - inode->i_size = inode->i_sb->s_blocksize; - i_size_write(inode, inode->i_sb->s_blocksize); - EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - ext4_update_final_de(dir_block->b_data, - inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, - inode->i_sb->s_blocksize - csum_size); - - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, - inode->i_sb->s_blocksize); - initialize_dirent_tail(t, inode->i_sb->s_blocksize); - } - set_buffer_uptodate(dir_block); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); - if (err) - goto out; - set_buffer_verified(dir_block); -out: - return err; -} - -static int ext4_convert_inline_data_nolock(handle_t *handle, - struct inode *inode, - struct ext4_iloc *iloc) -{ - int error; - void *buf = NULL; - struct buffer_head *data_bh = NULL; - struct ext4_map_blocks map; - int inline_size; - - inline_size = ext4_get_inline_size(inode); - buf = kmalloc(inline_size, GFP_NOFS); - if (!buf) { - error = -ENOMEM; - goto out; - } - - error = ext4_read_inline_data(inode, buf, inline_size, iloc); - if (error < 0) - goto out; - - error = ext4_destroy_inline_data_nolock(handle, inode); - if (error) - goto out; - - map.m_lblk = 0; - map.m_len = 1; - map.m_flags = 0; - error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); - if (error < 0) - goto out_restore; - if (!(map.m_flags & EXT4_MAP_MAPPED)) { - error = -EIO; - goto out_restore; - } - - data_bh = sb_getblk(inode->i_sb, map.m_pblk); - if (!data_bh) { - error = -EIO; - goto out_restore; - } - - lock_buffer(data_bh); - error = ext4_journal_get_create_access(handle, data_bh); - if (error) { - unlock_buffer(data_bh); - error = -EIO; - goto out_restore; - } - memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); - - if (!S_ISDIR(inode->i_mode)) { - memcpy(data_bh->b_data, buf, inline_size); - set_buffer_uptodate(data_bh); - error = ext4_handle_dirty_metadata(handle, - inode, data_bh); - } else { - error = ext4_finish_convert_inline_dir(handle, inode, data_bh, - buf, inline_size); - } - - unlock_buffer(data_bh); -out_restore: - if (error) - ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); - -out: - brelse(data_bh); - kfree(buf); - return error; -} - -/* - * Try to add the new entry to the inline data. - * If succeeds, return 0. If not, extended the inline dir and copied data to - * the new created block. - */ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - int ret, inline_size; - void *inline_start; - struct ext4_iloc iloc; - struct inode *dir = dentry->d_parent->d_inode; - - ret = ext4_get_inode_loc(dir, &iloc); - if (ret) - return ret; - - down_write(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) - goto out; - - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + - EXT4_INLINE_DOTDOT_SIZE; - inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); - if (ret != -ENOSPC) - goto out; - - /* check whether it can be inserted to inline xattr space. 
*/ - inline_size = EXT4_I(dir)->i_inline_size - - EXT4_MIN_INLINE_DATA_SIZE; - if (!inline_size) { - /* Try to use the xattr space.*/ - ret = ext4_update_inline_dir(handle, dir, &iloc); - if (ret && ret != -ENOSPC) - goto out; - - inline_size = EXT4_I(dir)->i_inline_size - - EXT4_MIN_INLINE_DATA_SIZE; - } - - if (inline_size) { - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); - - if (ret != -ENOSPC) - goto out; - } - - /* - * The inline space is filled up, so create a new block for it. - * As the extent tree will be created, we have to save the inline - * dir first. - */ - ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); - -out: - ext4_mark_inode_dirty(handle, dir); - up_write(&EXT4_I(dir)->xattr_sem); - brelse(iloc.bh); - return ret; -} - -int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data) -{ - int error = 0; - unsigned int offset, parent_ino; - int i, stored; - struct ext4_dir_entry_2 *de; - struct super_block *sb; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret, inline_size = 0; - struct ext4_iloc iloc; - void *dir_buf = NULL; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_read(&EXT4_I(inode)->xattr_sem); - *has_inline_data = 0; - goto out; - } - - inline_size = ext4_get_inline_size(inode); - dir_buf = kmalloc(inline_size, GFP_NOFS); - if (!dir_buf) { - ret = -ENOMEM; - up_read(&EXT4_I(inode)->xattr_sem); - goto out; - } - - ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); - up_read(&EXT4_I(inode)->xattr_sem); - if (ret < 0) - goto out; - - sb = inode->i_sb; - stored = 0; - parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); - - while (!error && !stored && filp->f_pos < inode->i_size) { -revalidate: - /* - * If the version has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the inline - * dir to make sure. - */ - if (filp->f_version != inode->i_version) { - for (i = 0; - i < inode->i_size && i < offset;) { - if (!i) { - /* skip "." and ".." if needed. */ - i += EXT4_INLINE_DOTDOT_SIZE; - continue; - } - de = (struct ext4_dir_entry_2 *) - (dir_buf + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - inline_size) < EXT4_DIR_REC_LEN(1)) - break; - i += ext4_rec_len_from_disk(de->rec_len, - inline_size); - } - offset = i; - filp->f_pos = offset; - filp->f_version = inode->i_version; - } - - while (!error && filp->f_pos < inode->i_size) { - if (filp->f_pos == 0) { - error = filldir(dirent, ".", 1, 0, inode->i_ino, - DT_DIR); - if (error) - break; - stored++; - - error = filldir(dirent, "..", 2, 0, parent_ino, - DT_DIR); - if (error) - break; - stored++; - - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; - continue; - } - - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); - if (ext4_check_dir_entry(inode, filp, de, - iloc.bh, dir_buf, - inline_size, offset)) { - ret = stored; - goto out; - } - offset += ext4_rec_len_from_disk(de->rec_len, - inline_size); - if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. 
So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; - } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - inline_size); - } - offset = 0; - } -out: - kfree(dir_buf); - brelse(iloc.bh); - return ret; -} - -struct buffer_head *ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval) -{ - struct ext4_iloc iloc; - - *retval = ext4_get_inode_loc(inode, &iloc); - if (*retval) - return NULL; - - *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; - - return iloc.bh; -} - -/* - * Try to create the inline data for the new dir. - * If it succeeds, return 0, otherwise return the error. - * In case of ENOSPC, the caller should create the normal disk layout dir. - */ -int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, - struct inode *inode) -{ - int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; - struct ext4_iloc iloc; - struct ext4_dir_entry_2 *de; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - ret = ext4_prepare_inline_data(handle, inode, inline_size); - if (ret) - goto out; - - /* - * For inline dir, we only save the inode information for the ".." - * and create a fake dentry to cover the left space. - */ - de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; - de->inode = cpu_to_le32(parent->i_ino); - de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); - de->inode = 0; - de->rec_len = ext4_rec_len_to_disk( - inline_size - EXT4_INLINE_DOTDOT_SIZE, - inline_size); - set_nlink(inode, 2); - inode->i_size = EXT4_I(inode)->i_disksize = inline_size; -out: - brelse(iloc.bh); - return ret; -} - -struct buffer_head *ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data) -{ - int ret; - struct ext4_iloc iloc; - void *inline_start; - int inline_size; - - if (ext4_get_inode_loc(dir, &iloc)) - return NULL; - - down_read(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) { - *has_inline_data = 0; - goto out; - } - - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + - EXT4_INLINE_DOTDOT_SIZE; - inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); - if (ret == 1) - goto out_find; - if (ret < 0) - goto out; - - if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) - goto out; - - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); - if (ret == 1) - goto out_find; - -out: - brelse(iloc.bh); - iloc.bh = NULL; -out_find: - up_read(&EXT4_I(dir)->xattr_sem); - return iloc.bh; -} - -int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data) -{ - int err, inline_size; - struct ext4_iloc iloc; - void *inline_start; - - err = ext4_get_inode_loc(dir, &iloc); - if (err) - return err; - - down_write(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) { - *has_inline_data = 0; - goto out; - } - - if ((void *)de_del - 
((void *)ext4_raw_inode(&iloc)->i_block) < - EXT4_MIN_INLINE_DATA_SIZE) { - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + - EXT4_INLINE_DOTDOT_SIZE; - inline_size = EXT4_MIN_INLINE_DATA_SIZE - - EXT4_INLINE_DOTDOT_SIZE; - } else { - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - inline_size = ext4_get_inline_size(dir) - - EXT4_MIN_INLINE_DATA_SIZE; - } - - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto out; - - err = ext4_generic_delete_entry(handle, dir, de_del, bh, - inline_start, inline_size, 0); - if (err) - goto out; - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_mark_inode_dirty(handle, dir); - if (unlikely(err)) - goto out; - - ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); -out: - up_write(&EXT4_I(dir)->xattr_sem); - brelse(iloc.bh); - if (err != -ENOENT) - ext4_std_error(dir->i_sb, err); - return err; -} - -/* - * Get the inline dentry at offset. - */ -static inline struct ext4_dir_entry_2 * -ext4_get_inline_entry(struct inode *inode, - struct ext4_iloc *iloc, - unsigned int offset, - void **inline_start, - int *inline_size) -{ - void *inline_pos; - - BUG_ON(offset > ext4_get_inline_size(inode)); - - if (offset < EXT4_MIN_INLINE_DATA_SIZE) { - inline_pos = (void *)ext4_raw_inode(iloc)->i_block; - *inline_size = EXT4_MIN_INLINE_DATA_SIZE; - } else { - inline_pos = ext4_get_inline_xattr_pos(inode, iloc); - offset -= EXT4_MIN_INLINE_DATA_SIZE; - *inline_size = ext4_get_inline_size(inode) - - EXT4_MIN_INLINE_DATA_SIZE; - } - - if (inline_start) - *inline_start = inline_pos; - return (struct ext4_dir_entry_2 *)(inline_pos + offset); -} - -int empty_inline_dir(struct inode *dir, int *has_inline_data) -{ - int err, inline_size; - struct ext4_iloc iloc; - void *inline_pos; - unsigned int offset; - struct ext4_dir_entry_2 *de; - int ret = 1; - - err = ext4_get_inode_loc(dir, &iloc); - if (err) { - EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", - err, dir->i_ino); - return 1; - } - - down_read(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) { - *has_inline_data = 0; - goto out; - } - - de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; - if (!le32_to_cpu(de->inode)) { - ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - no `..'", - dir->i_ino); - ret = 1; - goto out; - } - - offset = EXT4_INLINE_DOTDOT_SIZE; - while (offset < dir->i_size) { - de = ext4_get_inline_entry(dir, &iloc, offset, - &inline_pos, &inline_size); - if (ext4_check_dir_entry(dir, NULL, de, - iloc.bh, inline_pos, - inline_size, offset)) { - ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - " - "inode %u, rec_len %u, name_len %d" - "inline size %d\n", - dir->i_ino, le32_to_cpu(de->inode), - le16_to_cpu(de->rec_len), de->name_len, - inline_size); - ret = 1; - goto out; - } - if (le32_to_cpu(de->inode)) { - ret = 0; - goto out; - } - offset += ext4_rec_len_from_disk(de->rec_len, inline_size); - } - -out: - up_read(&EXT4_I(dir)->xattr_sem); - brelse(iloc.bh); - return ret; -} - -int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) -{ - int ret; - - down_write(&EXT4_I(inode)->xattr_sem); - ret = ext4_destroy_inline_data_nolock(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); - - return ret; -} - -int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline) -{ - __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; - int error = 0; - struct ext4_iloc iloc; - - 
down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - *has_inline = 0; - goto out; - } - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - goto out; - - physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; - physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; - physical += offsetof(struct ext4_inode, i_block); - length = i_size_read(inode); - - if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); - brelse(iloc.bh); -out: - up_read(&EXT4_I(inode)->xattr_sem); - return (error < 0 ? error : 0); -} - -/* - * Called during xattr set, and if we can sparse space 'needed', - * just create the extent tree evict the data to the outer block. - * - * We use jbd2 instead of page cache to move data to the 1st block - * so that the whole transaction can be committed as a whole and - * the data isn't lost because of the delayed page cache write. - */ -int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed) -{ - int error; - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - if (EXT4_XATTR_LEN(entry->e_name_len) + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { - error = -ENOSPC; - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); -out: - brelse(iloc.bh); - return error; -} - -void ext4_inline_data_truncate(struct inode *inode, int *has_inline) -{ - handle_t *handle; - int inline_size, value_len, needed_blocks; - size_t i_size; - void *value = NULL; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - - - needed_blocks = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) - return; - - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - *has_inline = 0; - ext4_journal_stop(handle); - return; - } - - if (ext4_orphan_add(handle, inode)) - goto out; - - if (ext4_get_inode_loc(inode, &is.iloc)) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = inode->i_size; - inline_size = ext4_get_inline_size(inode); - EXT4_I(inode)->i_disksize = i_size; - - if (i_size < inline_size) { - /* Clear the content in the xattr space. */ - if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { - if (ext4_xattr_ibody_find(inode, &i, &is)) - goto out_error; - - BUG_ON(is.s.not_found); - - value_len = le32_to_cpu(is.s.here->e_value_size); - value = kmalloc(value_len, GFP_NOFS); - if (!value) - goto out_error; - - if (ext4_xattr_ibody_get(inode, i.name_index, i.name, - value, value_len)) - goto out_error; - - i.value = value; - i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? - i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; - if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) - goto out_error; - } - - /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); - - EXT4_I(inode)->i_inline_size = i_size < - EXT4_MIN_INLINE_DATA_SIZE ? 
- EXT4_MIN_INLINE_DATA_SIZE : i_size; - } - -out_error: - up_write(&EXT4_I(inode)->i_data_sem); -out: - brelse(is.iloc.bh); - up_write(&EXT4_I(inode)->xattr_sem); - kfree(value); - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - ext4_journal_stop(handle); - return; -} - -int ext4_convert_inline_data(struct inode *inode) -{ - int error, needed_blocks; - handle_t *handle; - struct ext4_iloc iloc; - - if (!ext4_has_inline_data(inode)) { - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - return 0; - } - - needed_blocks = ext4_writepage_trans_blocks(inode); - - iloc.bh = NULL; - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto out_free; - } - - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_write(&EXT4_I(inode)->xattr_sem); - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); - up_write(&EXT4_I(inode)->xattr_sem); -out: - ext4_journal_stop(handle); -out_free: - brelse(iloc.bh); - return error; -} diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index cb1c1ab2720b..b3c243b9afa5 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -483,6 +483,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +/* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. + */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -531,16 +574,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* delayed alloc may be allocated by fallocate and - * coverted to initialized by directIO. - * we need to handle delayed extent here. 
- */ - down_write((&EXT4_I(inode)->i_data_sem)); - goto delayed_mapped; - } - ret = check_block_validity(inode, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -618,15 +652,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; -delayed_mapped: - /* delayed allocation blocks has been allocated */ - ret = ext4_es_remove_extent(inode, map->m_lblk, - map->m_len); - if (ret < 0) - retval = ret; - } + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); } up_write((&EXT4_I(inode)->i_data_sem)); @@ -649,13 +680,10 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, int ret = 0, started = 0; int dio_credits; - if (ext4_has_inline_data(inode)) - return -ERANGE; - map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + if (flags && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -770,13 +798,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -int ext4_walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -826,8 +854,8 @@ int ext4_walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. 
*/ -int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; @@ -850,7 +878,7 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -874,17 +902,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - flags, pagep); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } - } - retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { @@ -902,7 +919,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ret = -ENOMEM; goto out; } - *pagep = page; if (ext4_should_dioread_nolock(inode)) @@ -911,9 +927,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); } if (ret) { @@ -968,12 +983,7 @@ static int ext4_generic_write_end(struct file *file, struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * No need to use i_size_read() here, the i_size @@ -1124,21 +1134,16 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else { - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } - - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); @@ -1296,7 +1301,6 @@ static void ext4_da_page_release_reservation(struct page *page, struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int num_clusters; - ext4_fsblk_t lblk; head = page_buffers(page); bh = head; @@ -1306,23 +1310,20 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - if (to_release) { - lblk = page->index << 
(PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, lblk, to_release); - } - /* If we have released all the blocks belonging to a cluster, then we * need to release the reserved space for that cluster. */ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { + ext4_fsblk_t lblk; lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) + !ext4_find_delalloc_cluster(inode, lblk, 1)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1428,6 +1429,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1497,16 +1500,9 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t start, last; index = mpd->first_page; end = mpd->next_page - 1; - - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, start, last - start + 1); - - pagevec_init(&pvec, 0); while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -1660,6 +1656,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); + + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) { + /* Only if the journal is aborted */ + mpd->retval = err; + goto submit_io; + } + } } /* @@ -1790,19 +1795,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_has_inline_data(inode)) { - /* - * We will soon create blocks for this page, and let - * us pretend as if the blocks aren't allocated yet. - * In case of clusters, we have to handle the work - * of mapping from cluster so that the reserved space - * is calculated properly. - */ - if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - retval = 0; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); @@ -1821,10 +1814,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, goto out_unlock; } - retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); - if (retval) - goto out_unlock; - /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. */ @@ -1853,8 +1842,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. 
*/ -int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; @@ -1928,29 +1917,15 @@ static int __ext4_journalled_writepage(struct page *page, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs = NULL; + struct buffer_head *page_bufs; handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; + int ret = 0; + int err; ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } else { - page_bufs = page_buffers(page); - if (!page_bufs) { - BUG(); - goto out; - } - ext4_walk_page_buffers(handle, page_bufs, 0, len, - NULL, bget_one); - } + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1963,18 +1938,11 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_journal_get_write_access(handle, inode_bh); - - err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); - } else { - ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); - } + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1982,12 +1950,9 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, - NULL, bput_one); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: - brelse(inode_bh); return ret; } @@ -2064,8 +2029,8 @@ static int ext4_writepage(struct page *page, commit_write = 1; } page_bufs = page_buffers(page); - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do @@ -2131,8 +2096,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * mpage_da_map_and_submit to map a single contiguous memory region * and then write them. */ -static int write_cache_pages_da(handle_t *handle, - struct address_space *mapping, +static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) @@ -2211,17 +2175,6 @@ static int write_cache_pages_da(handle_t *handle, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - /* - * If we have inline data and arrive here, it means that - * we will soon create the block for the 1st page, so - * we'd better clear the inline data here. 
- */ - if (ext4_has_inline_data(inode)) { - BUG_ON(ext4_test_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA)); - ext4_destroy_inline_data(handle, inode); - } - if (mpd->next_page != page->index) mpd->first_page = page->index; mpd->next_page = page->index + 1; @@ -2428,8 +2381,7 @@ static int ext4_da_writepages(struct address_space *mapping, * contiguous region of logical blocks that need * blocks to be allocated by ext4 and submit them. */ - ret = write_cache_pages_da(handle, mapping, - wbc, &mpd, &done_index); + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -2493,6 +2445,7 @@ static int ext4_da_writepages(struct address_space *mapping, return ret; } +#define FALL_BACK_TO_NONDELALLOC 1 static int ext4_nonda_switch(struct super_block *sb) { s64 free_blocks, dirty_blocks; @@ -2549,19 +2502,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); - - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, - pos, len, flags, - pagep, fsdata); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } - } - retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2663,13 +2603,22 @@ static int ext4_da_write_end(struct file *file, * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. */ + new_i_size = pos + copied; if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) { + if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + EXT4_I(inode)->i_disksize = new_i_size; + } up_write(&EXT4_I(inode)->i_data_sem); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size @@ -2678,16 +2627,8 @@ static int ext4_da_write_end(struct file *file, ext4_mark_inode_dirty(handle, inode); } } - - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - copied = ret2; if (ret2 < 0) ret = ret2; @@ -2780,12 +2721,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - /* - * We can get here for an inline file via the FIBMAP ioctl - */ - if (ext4_has_inline_data(inode)) - return 0; - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* @@ -2831,30 +2766,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { - int ret = -EAGAIN; - struct inode *inode = page->mapping->host; - trace_ext4_readpage(page); - - if (ext4_has_inline_data(inode)) - ret = ext4_readpage_inline(inode, page); - - if (ret == -EAGAIN) - return mpage_readpage(page, ext4_get_block); - - return ret; + return mpage_readpage(page, ext4_get_block); 
} static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = mapping->host; - - /* If the file has inline data, no need to do readpages. */ - if (ext4_has_inline_data(inode)) - return 0; - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } @@ -2921,7 +2840,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. */ -int ext4_get_block_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", @@ -2931,12 +2850,29 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, } static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh_result, int flags) { - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); + handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; + int ret = 0; + + ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n", + inode->i_ino, flags); + + flags = EXT4_GET_BLOCKS_NO_LOCK; + + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | + map.m_flags; + bh_result->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } + return ret; } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3042,10 +2978,10 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) * fall back to buffered IO. * * For holes, we fallocate those blocks, mark them as uninitialized - * If those blocks were preallocated, we mark sure they are split, but + * If those blocks were preallocated, we mark sure they are splited, but * still keep the range to write as uninitialized. * - * The unwritten extents will be converted to written when DIO is completed. + * The unwrritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we * set up an end_io call back function, which will do the conversion * when async direct IO completed. @@ -3063,120 +2999,125 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; size_t count = iov_length(iov, nr_segs); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - /* Use the old path for reads and writes beyond i_size. 
*/ - if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + int overwrite = 0; - BUG_ON(iocb->private == NULL); + BUG_ON(iocb->private == NULL); - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); - if (overwrite) { - atomic_inc(&inode->i_dio_count); - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } - - /* - * We could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose - * the stale data before DIO complete the data IO. - * - * As to previously fallocated extents, ext4 get_block will - * just simply mark the buffer mapped but still keep the - * extents uninitialized. - * - * For non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. - */ - iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; + if (overwrite) { + atomic_inc(&inode->i_dio_count); + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); } - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - if (overwrite) { - get_block_func = ext4_get_block_write_nolock; - } else { - get_block_func = ext4_get_block_write; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); - - if (iocb->private) - ext4_inode_aio_set(inode, NULL); - /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. - */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized + * to prevent parallel buffered read to expose the stale data + * before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. 
+ * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. + */ iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + ext4_io_end_t *io_end = + ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; + /* + * we save the io structure for current async + * direct IO, so that later ext4_map_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } + + if (overwrite) + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_write_nolock, + ext4_end_io_dio, + NULL, + 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_write, + ext4_end_io_dio, + NULL, + DIO_LOCKING); + if (iocb->private) + ext4_inode_aio_set(inode, NULL); /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } -retake_lock: - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - inode_dio_done(inode); - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); + retake_lock: + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + inode_dio_done(inode); + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); + } + + return ret; } - return ret; + /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, @@ -3193,10 +3134,6 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_should_journal_data(inode)) return 0; - /* Let buffer I/O handle the inline data case. 
*/ - if (ext4_has_inline_data(inode)) - return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -3594,14 +3531,6 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (ext4_has_inline_data(inode)) { - int has_inline = 1; - - ext4_inline_data_truncate(inode, &has_inline); - if (has_inline) - return; - } - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); else @@ -3827,19 +3756,6 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } -static inline void ext4_iget_extra_inode(struct inode *inode, - struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - ext4_find_inline_data_nolock(inode); - } else - EXT4_I(inode)->i_inline_off = 0; -} - struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -3910,7 +3826,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ - ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -3983,7 +3898,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - ext4_iget_extra_inode(inode, raw_inode, ei); + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } } @@ -4006,19 +3925,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (!ext4_has_inline_data(inode)) { - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode)))) - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); - } + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; @@ -4205,10 +4122,9 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else if (!ext4_has_inline_data(inode)) { + } else for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; - } 
raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -4895,9 +4811,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, page_buffers(page), - 0, len, NULL, - ext4_bh_unmapped)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_on_page_writeback(page); ret = VM_FAULT_LOCKED; @@ -4918,7 +4833,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } ret = __block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { - if (ext4_walk_page_buffers(handle, page_buffers(page), 0, + if (walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; diff --git a/trunk/fs/ext4/mballoc.c b/trunk/fs/ext4/mballoc.c index 1bf6fe785c4f..526e55358606 100644 --- a/trunk/fs/ext4/mballoc.c +++ b/trunk/fs/ext4/mballoc.c @@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_start += next; while (needed > ex->fe_len && - mb_find_buddy(e4b, order, &max)) { + (buddy = mb_find_buddy(e4b, order, &max))) { if (block + 1 >= max) break; @@ -2607,17 +2607,9 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4318,10 +4310,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) { - ext4_discard_allocated_blocks(ac); + if (*errp) goto errout; - } /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4343,10 +4333,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) { + } else if (*errp) + errout: ext4_discard_allocated_blocks(ac); - goto errout; - } else { + else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4357,7 +4347,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = -ENOSPC; } -errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; @@ -4667,16 +4656,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * with group lock held. 
generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%lu failed" - " with %d", block_group, bit, count, - err); - } - - + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4870,11 +4851,10 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, +static void ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; - int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -4890,10 +4870,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count); + ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); - return ret; } /** @@ -4922,7 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret = 0; + int ret; trace_ext4_trim_all_free(sb, group, start, max); @@ -4949,11 +4928,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, &e4b); - if (ret && ret != -EOPNOTSUPP) - break; - ret = 0; + ext4_trim_extent(sb, start, + next - start, group, &e4b); count += next - start; } free_count += next - start; @@ -4974,10 +4950,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, break; } - if (!ret) { - ret = count; + if (!ret) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4985,7 +4959,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_debug("trimmed %d blocks in the group %d\n", count, group); - return ret; + return count; } /** diff --git a/trunk/fs/ext4/migrate.c b/trunk/fs/ext4/migrate.c index db8226d595fa..f1bb32ec0169 100644 --- a/trunk/fs/ext4/migrate.c +++ b/trunk/fs/ext4/migrate.c @@ -14,7 +14,6 @@ #include #include "ext4_jbd2.h" -#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/trunk/fs/ext4/move_extent.c b/trunk/fs/ext4/move_extent.c index d9cc5ee42f53..292daeeed455 100644 --- a/trunk/fs/ext4/move_extent.c +++ b/trunk/fs/ext4/move_extent.c @@ -18,7 +18,6 @@ #include #include "ext4_jbd2.h" #include "ext4.h" -#include "ext4_extents.h" /** * get_ext_path - Find an extent path for designated logical block number. 
diff --git a/trunk/fs/ext4/namei.c b/trunk/fs/ext4/namei.c index cac448282331..6d600a69fc9d 100644 --- a/trunk/fs/ext4/namei.c +++ b/trunk/fs/ext4/namei.c @@ -202,8 +202,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* checksumming functions */ -void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize) +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) { memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( @@ -256,12 +261,6 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } -static void warn_no_space_for_csum(struct inode *inode) -{ - ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " - "checksum. Please run e2fsck -D.", inode->i_ino); -} - int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; @@ -272,7 +271,8 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) t = get_dirent_tail(inode, dirent); if (!t) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); return 0; } @@ -294,7 +294,8 @@ static void ext4_dirent_csum_set(struct inode *inode, t = get_dirent_tail(inode, dirent); if (!t) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); return; } @@ -302,9 +303,9 @@ static void ext4_dirent_csum_set(struct inode *inode, (void *)t - (void *)dirent); } -int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +static inline int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) { ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); @@ -376,7 +377,8 @@ static int ext4_dx_csum_verify(struct inode *inode, count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum found. Run e2fsck -D."); return 1; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -406,7 +408,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum. Run e2fsck -D."); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -887,7 +890,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, (block<i_sb)) + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. 
*/ @@ -1005,15 +1007,6 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, return (err); } -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) -{ - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); -} /* * Directory block splitting, compacting @@ -1088,6 +1081,13 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } +static void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} + /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. * @@ -1107,13 +1107,11 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; @@ -1121,8 +1119,8 @@ int search_dir(struct buffer_head *bh, const char *name = d_name->name; int namelen = d_name->len; - de = (struct ext4_dir_entry_2 *)search_buf; - dlimit = search_buf + buf_size; + de = (struct ext4_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ @@ -1130,8 +1128,7 @@ int search_dir(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1147,21 +1144,6 @@ int search_dir(struct buffer_head *bh, return 0; } -static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, - struct ext4_dir_entry *de) -{ - struct super_block *sb = dir->i_sb; - - if (!is_dx(dir)) - return 0; - if (block == 0) - return 1; - if (de->inode == 0 && - ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == - sb->s_blocksize) - return 1; - return 0; -} /* * ext4_find_entry() @@ -1176,8 +1158,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, */ static struct buffer_head * ext4_find_entry (struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *inlined) + struct ext4_dir_entry_2 ** res_dir) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; @@ -1198,18 +1179,6 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; - - if (ext4_has_inline_data(dir)) { - int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, - &has_inline_data); - if (has_inline_data) { - if (inlined) - *inlined = 1; - return ret; - } - } - if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' 
|| name[1] == '\0')) { /* @@ -1275,8 +1244,6 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto next; } if (!buffer_verified(bh) && - !is_dx_internal_node(dir, block, - (struct ext4_dir_entry *)bh->b_data) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { EXT4_ERROR_INODE(dir, "checksumming directory " @@ -1394,7 +1361,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1428,7 +1395,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + bh = ext4_find_entry(child->d_inode, &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1626,63 +1593,6 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, return NULL; } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, - void *buf, int buf_size, - const char *name, int namelen, - struct ext4_dir_entry_2 **dest_de) -{ - struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); - int nlen, rlen; - unsigned int offset = 0; - char *top; - - de = (struct ext4_dir_entry_2 *)buf; - top = buf + buf_size - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); - if ((de->inode ? rlen - nlen : rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; - - *dest_de = de; - return 0; -} - -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen) -{ - - int nlen, rlen; - - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = - (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); - de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); -} /* * Add a new entry into a directory (leaf) block. 
If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1698,10 +1608,12 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; + unsigned int offset = 0; unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; + int nlen, rlen, err; + char *top; int csum_size = 0; - int err; if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -1709,11 +1621,22 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); - if (err) - return err; + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + (blocksize - csum_size) - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1723,8 +1646,19 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, name, namelen); - + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1897,17 +1831,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; - - if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, dentry, inode); - if (retval < 0) - return retval; - if (retval == 1) { - retval = 0; - return retval; - } - } - if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) @@ -2113,29 +2036,36 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, } /* - * ext4_generic_delete_entry deletes a directory entry by merging it - * with the previous entry + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry */ -int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - void *entry_buf, - int buf_size, - int csum_size) +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i; + int csum_size = 0; + int i, err; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + 
csum_size = sizeof(struct ext4_dir_entry_tail); i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *)entry_buf; - while (i < buf_size - csum_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, i)) + de = (struct ext4_dir_entry_2 *) bh->b_data; + while (i < bh->b_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -2146,6 +2076,12 @@ int ext4_generic_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2155,48 +2091,6 @@ int ext4_generic_delete_entry(handle_t *handle, return -ENOENT; } -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) -{ - int err, csum_size = 0; - - if (ext4_has_inline_data(dir)) { - int has_inline_data = 1; - err = ext4_delete_inline_entry(handle, dir, de_del, bh, - &has_inline_data); - if (has_inline_data) - return err; - } - - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out; - - err = ext4_generic_delete_entry(handle, dir, de_del, - bh, bh->b_data, - dir->i_sb->s_blocksize, csum_size); - if (err) - goto out; - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, dir, bh); - if (unlikely(err)) - goto out; - - return 0; -out: - if (err != -ENOENT) - ext4_std_error(dir->i_sb, err); - return err; -} - /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. 
@@ -2317,95 +2211,21 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, return err; } -struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len) -{ - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); - strcpy(de->name, "."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); - - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(parent_ino); - de->name_len = 2; - if (!dotdot_real_len) - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), - blocksize); - else - de->rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(de->name_len), blocksize); - strcpy(de->name, ".."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); - - return ext4_next_entry(de, blocksize); -} - -static int ext4_init_new_dir(handle_t *handle, struct inode *dir, - struct inode *inode) +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { + handle_t *handle; + struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err; + int err, retries = 0; if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - err = ext4_try_create_inline_dir(handle, dir, inode); - if (err < 0 && err != -ENOSPC) - goto out; - if (!err) - goto out; - } - - inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out; - } - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out; - de = (struct ext4_dir_entry_2 *)dir_block->b_data; - ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); - set_nlink(inode, 2); - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); - initialize_dirent_tail(t, blocksize); - } - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); - if (err) - goto out; - set_buffer_verified(dir_block); -out: - brelse(dir_block); - return err; -} - -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2429,9 +2249,47 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; - err = ext4_init_new_dir(handle, dir, inode); + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + goto out_clear_inode; + } + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + 
de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + set_nlink(inode, 2); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) goto out_clear_inode; + set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2451,6 +2309,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) unlock_new_inode(inode); d_instantiate(dentry, inode); out_stop: + brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2468,14 +2327,6 @@ static int empty_dir(struct inode *inode) struct super_block *sb; int err = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - - err = empty_inline_dir(inode, &has_inline_data); - if (has_inline_data) - return err; - } - sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { @@ -2542,8 +2393,7 @@ static int empty_dir(struct inode *inode) set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, NULL, de, bh, - bh->b_data, bh->b_size, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -2729,7 +2579,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_rmdir; @@ -2794,7 +2644,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_handle_sync(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_unlink; @@ -2976,39 +2826,8 @@ static int ext4_link(struct dentry *old_dentry, return err; } - -/* - * Try to find buffer head where contains the parent block. - * It should be the inode block if it is inlined or the 1st block - * if it is a normal dir. 
- */ -static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, - struct inode *inode, - int *retval, - struct ext4_dir_entry_2 **parent_de, - int *inlined) -{ - struct buffer_head *bh; - - if (!ext4_has_inline_data(inode)) { - if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { - if (!*retval) { - *retval = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - return NULL; - } - *parent_de = ext4_next_entry( - (struct ext4_dir_entry_2 *)bh->b_data, - inode->i_sb->s_blocksize); - return bh; - } - - *inlined = 1; - return ext4_get_first_inline_block(inode, parent_de, retval); -} +#define PARENT_INO(buffer, size) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) /* * Anybody can rename anything with this: the permission checks are left to the @@ -3022,8 +2841,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; - int inlined = 0, new_inlined = 0; - struct ext4_dir_entry_2 *parent_de; dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -3043,7 +2860,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) ext4_handle_sync(handle); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3056,8 +2873,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, - &new_de, &new_inlined); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { brelse(new_bh); @@ -3071,17 +2887,22 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - dir_bh = ext4_get_first_dir_block(handle, old_inode, - &retval, &parent_de, - &inlined); - if (!dir_bh) + if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { + if (!retval) { + retval = -EIO; + ext4_error(old_inode->i_sb, + "Directory hole detected on inode %lu\n", + old_inode->i_ino); + } goto end_rename; - if (!inlined && !buffer_verified(dir_bh) && + } + if (!buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) goto end_rename; set_buffer_verified(dir_bh); - if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) + if (le32_to_cpu(PARENT_INO(dir_bh->b_data, + old_dir->i_sb->s_blocksize)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -3110,13 +2931,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - if (!new_inlined) { - retval = ext4_handle_dirty_dirent_node(handle, - new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; - } + retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; } brelse(new_bh); new_bh = NULL; @@ -3144,8 +2962,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; 
struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, - &old_de2, NULL); + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -3165,19 +2982,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - parent_de->inode = cpu_to_le32(new_dir->i_ino); + PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = + cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (!inlined) { - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); - } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, dir_bh); - } + if (is_dx(old_inode)) { + retval = ext4_handle_dirty_dx_node(handle, + old_inode, + dir_bh); } else { - retval = ext4_mark_inode_dirty(handle, old_inode); + retval = ext4_handle_dirty_dirent_node(handle, + old_inode, + dir_bh); } if (retval) { ext4_std_error(old_dir->i_sb, retval); @@ -3228,19 +3043,23 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif .get_acl = ext4_get_acl, }; diff --git a/trunk/fs/ext4/page-io.c b/trunk/fs/ext4/page-io.c index 0016fbca2a40..68e896e12a67 100644 --- a/trunk/fs/ext4/page-io.c +++ b/trunk/fs/ext4/page-io.c @@ -27,6 +27,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" static struct kmem_cache *io_page_cachep, *io_end_cachep; @@ -110,7 +111,7 @@ static int ext4_end_io(ext4_io_end_t *io) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } diff --git a/trunk/fs/ext4/resize.c b/trunk/fs/ext4/resize.c index d99387b89edd..47bf06a2765d 100644 --- a/trunk/fs/ext4/resize.c +++ b/trunk/fs/ext4/resize.c @@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_dind; + goto exit_sbh; err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) @@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dind; + goto exit_dindj; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,7 +846,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, exit_inode: ext4_kvfree(n_group_desc); + /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); +exit_dindj: + /* ext4_handle_release_buffer(handle, dind); */ +exit_sbh: + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -964,8 +969,14 @@ static int 
reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) + if ((err = ext4_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext4_handle_release_buffer(handle, primary[j]); + */ goto exit_bh; + } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index 3cdb0a2fc648..80928f716850 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -45,7 +45,7 @@ #include #include "ext4.h" -#include "ext4_extents.h" /* Needed for trace points definition */ +#include "ext4_extents.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -939,11 +939,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - ext4_es_init_tree(&ei->i_es_tree); - rwlock_init(&ei->i_es_lock); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -997,7 +996,9 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); +#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } @@ -1030,7 +1031,6 @@ void ext4_clear_inode(struct inode *inode) clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1447,8 +1447,13 @@ static const struct mount_opts { {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, +#ifdef CONFIG_EXT4_FS_XATTR {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, +#else + {Opt_user_xattr, 0, MOPT_NOSUPPORT}, + {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, +#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, @@ -3197,6 +3202,7 @@ int ext4_calculate_overhead(struct super_block *sb) ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_KERNEL); + memset(buf, 0, PAGE_SIZE); if (!buf) return -ENOMEM; @@ -3250,7 +3256,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int i; int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err = 0; + int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; @@ -3266,6 +3272,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb = sb; + sbi->s_mount_opt = 0; + sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) @@ -3276,7 +3285,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; - /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { @@ 
-3361,7 +3369,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ +#ifdef CONFIG_EXT4_FS_XATTR set_opt(sb, XATTR_USER); +#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3652,6 +3662,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -3759,6 +3770,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); + ret = err; goto failed_mount3; } @@ -3789,6 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3884,8 +3897,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (es->s_overhead_clusters) sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); else { - err = ext4_calculate_overhead(sb); - if (err) + ret = ext4_calculate_overhead(sb); + if (ret) goto failed_mount_wq; } @@ -3897,7 +3910,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); - ret = -ENOMEM; goto failed_mount_wq; } @@ -4000,20 +4012,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Enable quota usage during mount. */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && !(sb->s_flags & MS_RDONLY)) { - err = ext4_enable_quotas(sb); - if (err) + ret = ext4_enable_quotas(sb); + if (ret) goto failed_mount7; } #endif /* CONFIG_QUOTA */ - if (test_opt(sb, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - ext4_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); @@ -4080,7 +4084,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(sbi); out_free_orig: kfree(orig_data); - return err ? 
err : ret; + return ret; } /* @@ -4786,7 +4790,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ @@ -5278,7 +5282,6 @@ static int __init ext4_init_fs(void) ext4_li_info = NULL; mutex_init(&ext4_li_mtx); - /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { @@ -5286,14 +5289,9 @@ static int __init ext4_init_fs(void) init_waitqueue_head(&ext4__ioend_wq[i]); } - err = ext4_init_es(); - if (err) - return err; - err = ext4_init_pageio(); if (err) - goto out7; - + return err; err = ext4_init_system_zone(); if (err) goto out6; @@ -5343,9 +5341,6 @@ static int __init ext4_init_fs(void) ext4_exit_system_zone(); out6: ext4_exit_pageio(); -out7: - ext4_exit_es(); - return err; } diff --git a/trunk/fs/ext4/symlink.c b/trunk/fs/ext4/symlink.c index ff3711932018..ed9354aff279 100644 --- a/trunk/fs/ext4/symlink.c +++ b/trunk/fs/ext4/symlink.c @@ -35,18 +35,22 @@ const struct inode_operations ext4_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif }; diff --git a/trunk/fs/ext4/xattr.c b/trunk/fs/ext4/xattr.c index 3a91ebc2b66f..2cdb98d62980 100644 --- a/trunk/fs/ext4/xattr.c +++ b/trunk/fs/ext4/xattr.c @@ -61,6 +61,11 @@ #include "xattr.h" #include "acl.h" +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -307,7 +312,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, return error; } -int +static int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -576,6 +581,21 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -628,14 +648,9 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear pad bytes first. 
*/ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); return 0; } @@ -674,14 +689,9 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear the pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); } } return 0; @@ -784,6 +794,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); + ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; @@ -939,8 +950,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, #undef header } -int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +static int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -968,47 +985,10 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_search *s = &is->s; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext4_xattr_set_entry(i, s); - if (error) { - if (error == -ENOSPC && - ext4_has_inline_data(inode)) { - error = ext4_try_to_evict_inline_data(handle, inode, - EXT4_XATTR_LEN(strlen(i->name) + - EXT4_XATTR_SIZE(i->value_len))); - if (error) - return error; - error = ext4_xattr_ibody_find(inode, i, is); - if (error) - return error; - error = ext4_xattr_set_entry(i, s); - } - if (error) - return error; - } - header = IHDR(inode, ext4_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext4_clear_inode_state(inode, EXT4_STATE_XATTR); - } - return 0; -} - -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +static int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -1164,17 +1144,9 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); retry: - /* - * In case of inline data, we may push out the data to a block, - * So reserve the journal space first. 
- */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/trunk/fs/ext4/xattr.h b/trunk/fs/ext4/xattr.h index 69eda787a96a..91f31ca7d9af 100644 --- a/trunk/fs/ext4/xattr.h +++ b/trunk/fs/ext4/xattr.h @@ -21,7 +21,6 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 -#define EXT4_XATTR_INDEX_SYSTEM 7 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -66,32 +65,7 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - -#define EXT4_ZERO_XATTR_VALUE ((void *)-1) - -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; +# ifdef CONFIG_EXT4_FS_XATTR extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; @@ -116,82 +90,60 @@ extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; -extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); -extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, - const char *name, - void *buffer, size_t buffer_size); -extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); - -extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); -extern int ext4_get_max_inline_size(struct inode *inode); -extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); - -extern int ext4_readpage_inline(struct inode *inode, struct page *page); -extern int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); -extern int ext4_try_add_inline_entry(handle_t 
*handle, struct dentry *dentry, - struct inode *inode); -extern int ext4_try_create_inline_dir(handle_t *handle, - struct inode *parent, - struct inode *inode); -extern int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data); -extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data); -extern int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); -extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); -extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); - -extern int ext4_convert_inline_data(struct inode *inode); +# else /* CONFIG_EXT4_FS_XATTR */ + +static inline int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext4_xattr_put_super(struct super_block *sb) +{ +} + +static __init inline int +ext4_init_xattr(void) +{ + return 0; +} + +static inline void +ext4_exit_xattr(void) +{ +} + +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + +#define ext4_xattr_handlers NULL + +# endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, diff --git a/trunk/fs/jbd2/journal.c b/trunk/fs/jbd2/journal.c index dbf41f9452db..484b8d1c6cb6 100644 --- a/trunk/fs/jbd2/journal.c +++ b/trunk/fs/jbd2/journal.c @@ -60,6 +60,7 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); +EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); #if 0 EXPORT_SYMBOL(journal_sync_buffer); diff --git a/trunk/fs/jbd2/transaction.c b/trunk/fs/jbd2/transaction.c index 42f6615af0ac..d8da40e99d84 100644 --- a/trunk/fs/jbd2/transaction.c +++ b/trunk/fs/jbd2/transaction.c @@ -1207,6 +1207,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) return ret; } +/* + * jbd2_journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + /** * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 
* @handle: transaction handle diff --git a/trunk/fs/nfs/idmap.c b/trunk/fs/nfs/idmap.c index bc3968fa81e5..9cc4a3fbf4b0 100644 --- a/trunk/fs/nfs/idmap.c +++ b/trunk/fs/nfs/idmap.c @@ -193,15 +193,19 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".id_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; diff --git a/trunk/include/asm-generic/pgtable.h b/trunk/include/asm-generic/pgtable.h index 701beab27aab..284e80831d2c 100644 --- a/trunk/include/asm-generic/pgtable.h +++ b/trunk/include/asm-generic/pgtable.h @@ -219,10 +219,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif -#ifndef pte_accessible -# define pte_accessible(pte) ((void)(pte),1) -#endif - #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -584,112 +580,6 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } -#ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -/* - * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the - * same bit too). It's set only when _PAGE_PRESET is not set and it's - * never set if _PAGE_PRESENT is set. - * - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page - * fault triggers on those regions if pte/pmd_numa returns true - * (because _PAGE_PRESENT is not set). - */ -#ifndef pte_numa -static inline int pte_numa(pte_t pte) -{ - return (pte_flags(pte) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; -} -#endif - -#ifndef pmd_numa -static inline int pmd_numa(pmd_t pmd) -{ - return (pmd_flags(pmd) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; -} -#endif - -/* - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically - * because they're called by the NUMA hinting minor page fault. If we - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler - * would be forced to set it later while filling the TLB after we - * return to userland. That would trigger a second write to memory - * that we optimize away by setting _PAGE_ACCESSED here. 
- */ -#ifndef pte_mknonnuma -static inline pte_t pte_mknonnuma(pte_t pte) -{ - pte = pte_clear_flags(pte, _PAGE_NUMA); - return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); -} -#endif - -#ifndef pmd_mknonnuma -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - pmd = pmd_clear_flags(pmd, _PAGE_NUMA); - return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); -} -#endif - -#ifndef pte_mknuma -static inline pte_t pte_mknuma(pte_t pte) -{ - pte = pte_set_flags(pte, _PAGE_NUMA); - return pte_clear_flags(pte, _PAGE_PRESENT); -} -#endif - -#ifndef pmd_mknuma -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - pmd = pmd_set_flags(pmd, _PAGE_NUMA); - return pmd_clear_flags(pmd, _PAGE_PRESENT); -} -#endif -#else -extern int pte_numa(pte_t pte); -extern int pmd_numa(pmd_t pmd); -extern pte_t pte_mknonnuma(pte_t pte); -extern pmd_t pmd_mknonnuma(pmd_t pmd); -extern pte_t pte_mknuma(pte_t pte); -extern pmd_t pmd_mknuma(pmd_t pmd); -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ -#else -static inline int pmd_numa(pmd_t pmd) -{ - return 0; -} - -static inline int pte_numa(pte_t pte) -{ - return 0; -} - -static inline pte_t pte_mknonnuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - return pmd; -} - -static inline pte_t pte_mknuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - return pmd; -} -#endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/trunk/include/linux/cred.h b/trunk/include/linux/cred.h index 0142aacb70b7..ebbed2ce6637 100644 --- a/trunk/include/linux/cred.h +++ b/trunk/include/linux/cred.h @@ -76,6 +76,21 @@ extern int groups_search(const struct group_info *, kgid_t); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); +/* + * The common credentials for a thread group + * - shared by CLONE_THREAD + */ +#ifdef CONFIG_KEYS +struct thread_group_cred { + atomic_t usage; + pid_t tgid; /* thread group process ID */ + spinlock_t lock; + struct key __rcu *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ + struct rcu_head rcu; /* RCU deletion hook */ +}; +#endif + /* * The security context of a task * @@ -124,8 +139,6 @@ struct cred { #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ - struct key __rcu *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ struct thread_group_cred *tgcred; /* thread-group shared credentials */ diff --git a/trunk/include/linux/huge_mm.h b/trunk/include/linux/huge_mm.h index 1d76f8ca90f0..092dc5305a32 100644 --- a/trunk/include/linux/huge_mm.h +++ b/trunk/include/linux/huge_mm.h @@ -31,8 +31,7 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - int prot_numa); + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, @@ -112,7 +111,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, #define wait_split_huge_page(__anon_vma, __pmd) \ do { \ pmd_t *____pmd = (__pmd); \ - anon_vma_lock_write(__anon_vma); \ + anon_vma_lock(__anon_vma); \ anon_vma_unlock(__anon_vma); \ 
BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ @@ -172,10 +171,6 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } - -extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp); - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -214,13 +209,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, { return 0; } - -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) -{ - return 0; -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h index 0c80d3f57a5b..3e7fa1acf09c 100644 --- a/trunk/include/linux/hugetlb.h +++ b/trunk/include/linux/hugetlb.h @@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); #else /* !CONFIG_HUGETLB_PAGE */ @@ -132,11 +132,7 @@ static inline void copy_huge_page(struct page *dst, struct page *src) { } -static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned long address, unsigned long end, pgprot_t newprot) -{ - return 0; -} +#define hugetlb_change_protection(vma, address, end, newprot) static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, diff --git a/trunk/include/linux/jbd2.h b/trunk/include/linux/jbd2.h index 1be23d9fdacb..3efc43f3f162 100644 --- a/trunk/include/linux/jbd2.h +++ b/trunk/include/linux/jbd2.h @@ -1096,6 +1096,7 @@ extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); +extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); extern void jbd2_journal_invalidatepage(journal_t *, @@ -1302,21 +1303,15 @@ static inline int jbd_space_needed(journal_t *journal) extern int jbd_blocks_per_page(struct inode *inode); -/* JBD uses a CRC32 checksum */ -#define JBD_MAX_CHECKSUM_SIZE 4 - static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; - char ctx[JBD_MAX_CHECKSUM_SIZE]; + char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; } desc; int err; - BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > - JBD_MAX_CHECKSUM_SIZE); - desc.shash.tfm = journal->j_chksum_driver; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; diff --git a/trunk/include/linux/key.h b/trunk/include/linux/key.h index 4dfde1161c5e..2393b1c040b6 100644 --- a/trunk/include/linux/key.h +++ b/trunk/include/linux/key.h @@ -265,7 +265,6 @@ extern int key_unlink(struct key *keyring, extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, - key_perm_t perm, unsigned long flags, struct key *dest); diff --git a/trunk/include/linux/mempolicy.h 
b/trunk/include/linux/mempolicy.h index 9adc270de7ef..dbd212723b74 100644 --- a/trunk/include/linux/mempolicy.h +++ b/trunk/include/linux/mempolicy.h @@ -188,8 +188,6 @@ static inline int vma_migratable(struct vm_area_struct *vma) return 1; } -extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); - #else struct mempolicy {}; @@ -309,11 +307,5 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, return 0; } -static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, - unsigned long address) -{ - return -1; /* no node preference */ -} - #endif /* CONFIG_NUMA */ #endif diff --git a/trunk/include/linux/migrate.h b/trunk/include/linux/migrate.h index 1e9f627967a3..0b5865c61efd 100644 --- a/trunk/include/linux/migrate.h +++ b/trunk/include/linux/migrate.h @@ -23,15 +23,6 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page * sucessful migration case. */ -enum migrate_reason { - MR_COMPACTION, - MR_MEMORY_FAILURE, - MR_MEMORY_HOTPLUG, - MR_SYSCALL, /* also applies to cpusets */ - MR_MEMPOLICY_MBIND, - MR_NUMA_MISPLACED, - MR_CMA -}; #ifdef CONFIG_MIGRATION @@ -41,7 +32,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode, int reason); + enum migrate_mode mode); extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -63,7 +54,7 @@ static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode, int reason) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } @@ -92,37 +83,4 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ - -#ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, int node); -extern int migrate_misplaced_page(struct page *page, int node); -extern bool migrate_ratelimited(int node); -#else -static inline int migrate_misplaced_page(struct page *page, int node) -{ - return -EAGAIN; /* can't migrate now */ -} -static inline bool migrate_ratelimited(int node) -{ - return false; -} -#endif /* CONFIG_NUMA_BALANCING */ - -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node); -#else -static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node) -{ - return -EAGAIN; -} -#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ - #endif /* _LINUX_MIGRATE_H */ diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index 7f4f906190bd..4af4f0b1be4c 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -693,36 +693,6 @@ static inline int 
page_to_nid(const struct page *page) } #endif -#ifdef CONFIG_NUMA_BALANCING -static inline int page_xchg_last_nid(struct page *page, int nid) -{ - return xchg(&page->_last_nid, nid); -} - -static inline int page_last_nid(struct page *page) -{ - return page->_last_nid; -} -static inline void reset_page_last_nid(struct page *page) -{ - page->_last_nid = -1; -} -#else -static inline int page_xchg_last_nid(struct page *page, int nid) -{ - return page_to_nid(page); -} - -static inline int page_last_nid(struct page *page) -{ - return page_to_nid(page); -} - -static inline void reset_page_last_nid(struct page *page) -{ -} -#endif - static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; @@ -1108,9 +1078,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1612,11 +1579,6 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -#endif - struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); @@ -1638,7 +1600,6 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 7d9ebb7cc982..7ade2731b5d6 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -175,10 +175,6 @@ struct page { */ void *shadow; #endif - -#ifdef CONFIG_NUMA_BALANCING - int _last_nid; -#endif } /* * The struct page can be forced to be double word aligned so that atomic ops @@ -414,37 +410,10 @@ struct mm_struct { #endif #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; -#endif -#ifdef CONFIG_NUMA_BALANCING - /* - * numa_next_scan is the next time when the PTEs will me marked - * pte_numa to gather statistics and migrate pages to new nodes - * if necessary - */ - unsigned long numa_next_scan; - - /* numa_next_reset is when the PTE scanner period will be reset */ - unsigned long numa_next_reset; - - /* Restart point for scanning and setting pte_numa */ - unsigned long numa_scan_offset; - - /* numa_scan_seq prevents two threads setting pte_numa */ - int numa_scan_seq; - - /* - * The first node a task was scheduled on. If a task runs on - * a different node than Make PTE Scan Go Now. 
- */ - int first_nid; #endif struct uprobes_state uprobes_state; }; -/* first nid will either be a valid NID or one of these values */ -#define NUMA_PTE_SCAN_INIT -1 -#define NUMA_PTE_SCAN_ACTIVE -2 - static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/trunk/include/linux/mmzone.h b/trunk/include/linux/mmzone.h index 4bec5be82cab..cd55dad56aac 100644 --- a/trunk/include/linux/mmzone.h +++ b/trunk/include/linux/mmzone.h @@ -735,19 +735,6 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; -#ifdef CONFIG_NUMA_BALANCING - /* - * Lock serializing the per destination node AutoNUMA memory - * migration rate limiting data. - */ - spinlock_t numabalancing_migrate_lock; - - /* Rate limiting time interval */ - unsigned long numabalancing_migrate_next_window; - - /* Number of pages migrated during the rate limiting time interval */ - unsigned long numabalancing_migrate_nr_pages; -#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/trunk/include/linux/rmap.h b/trunk/include/linux/rmap.h index c20635c527a9..bfe1f4780644 100644 --- a/trunk/include/linux/rmap.h +++ b/trunk/include/linux/rmap.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include /* @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - struct anon_vma *root; /* Root of this anon_vma tree */ - struct rw_semaphore rwsem; /* W: modification, R: walking the list */ + struct anon_vma *root; /* Root of this anon_vma tree */ + struct mutex mutex; /* Serialize access to vma list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for @@ -64,7 +64,7 @@ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct rb_node rb; /* locked by anon_vma->rwsem */ + struct rb_node rb; /* locked by anon_vma->mutex */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; @@ -108,37 +108,26 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - down_write(&anon_vma->root->rwsem); + mutex_lock(&anon_vma->root->mutex); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - up_write(&anon_vma->root->rwsem); + mutex_unlock(&anon_vma->root->mutex); } -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +static inline void anon_vma_lock(struct anon_vma *anon_vma) { - down_write(&anon_vma->root->rwsem); + mutex_lock(&anon_vma->root->mutex); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - up_write(&anon_vma->root->rwsem); + mutex_unlock(&anon_vma->root->mutex); } -static inline void anon_vma_lock_read(struct anon_vma *anon_vma) -{ - down_read(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) -{ - up_read(&anon_vma->root->rwsem); -} - - /* * anon_vma helper functions. */ @@ -231,8 +220,8 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. 
*/ -struct anon_vma *page_lock_anon_vma_read(struct page *page); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma(struct page *page); +void page_unlock_anon_vma(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index b089c92c609b..2c2f3072beef 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -1527,14 +1527,6 @@ struct task_struct { short il_next; short pref_node_fork; #endif -#ifdef CONFIG_NUMA_BALANCING - int numa_scan_seq; - int numa_migrate_seq; - unsigned int numa_scan_period; - u64 node_stamp; /* migration stamp */ - struct callback_head numa_work; -#endif /* CONFIG_NUMA_BALANCING */ - struct rcu_head rcu; /* @@ -1609,18 +1601,6 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) -#ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); -extern void set_numabalancing_state(bool enabled); -#else -static inline void task_numa_fault(int node, int pages, bool migrated) -{ -} -static inline void set_numabalancing_state(bool enabled) -{ -} -#endif - /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -2050,13 +2030,6 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_period_reset; -extern unsigned int sysctl_numa_balancing_scan_size; -extern unsigned int sysctl_numa_balancing_settle_count; - #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git a/trunk/include/linux/swiotlb.h b/trunk/include/linux/swiotlb.h index 071d62c214a6..8d08b3ed406d 100644 --- a/trunk/include/linux/swiotlb.h +++ b/trunk/include/linux/swiotlb.h @@ -34,25 +34,21 @@ enum dma_sync_target { SYNC_FOR_CPU = 0, SYNC_FOR_DEVICE = 1, }; +extern void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir); -/* define the last possible byte of physical address space as a mapping error */ -#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0) - -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, size_t size, - enum dma_data_direction dir); - -extern void swiotlb_tbl_unmap_single(struct device *hwdev, - phys_addr_t tlb_addr, +extern void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, enum dma_data_direction dir); -extern void swiotlb_tbl_sync_single(struct device *hwdev, - phys_addr_t tlb_addr, +extern void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, enum dma_data_direction dir, enum dma_sync_target target); /* Accessory functions. 
*/ +extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/trunk/include/linux/vm_event_item.h b/trunk/include/linux/vm_event_item.h index fce0a2799d43..fe786f07d2bd 100644 --- a/trunk/include/linux/vm_event_item.h +++ b/trunk/include/linux/vm_event_item.h @@ -38,18 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, -#ifdef CONFIG_NUMA_BALANCING - NUMA_PTE_UPDATES, - NUMA_HINT_FAULTS, - NUMA_HINT_FAULTS_LOCAL, - NUMA_PAGE_MIGRATE, -#endif -#ifdef CONFIG_MIGRATION - PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, -#endif #ifdef CONFIG_COMPACTION - COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, - COMPACTISOLATED, + COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, #endif #ifdef CONFIG_HUGETLB_PAGE diff --git a/trunk/include/linux/vmstat.h b/trunk/include/linux/vmstat.h index a13291f7da88..92a86b2cce33 100644 --- a/trunk/include/linux/vmstat.h +++ b/trunk/include/linux/vmstat.h @@ -80,14 +80,6 @@ static inline void vm_events_fold_cpu(int cpu) #endif /* CONFIG_VM_EVENT_COUNTERS */ -#ifdef CONFIG_NUMA_BALANCING -#define count_vm_numa_event(x) count_vm_event(x) -#define count_vm_numa_events(x, y) count_vm_events(x, y) -#else -#define count_vm_numa_event(x) do {} while (0) -#define count_vm_numa_events(x, y) do {} while (0) -#endif /* CONFIG_NUMA_BALANCING */ - #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/trunk/include/trace/events/ext4.h b/trunk/include/trace/events/ext4.h index f6372b011366..d49b285385e8 100644 --- a/trunk/include/trace/events/ext4.h +++ b/trunk/include/trace/events/ext4.h @@ -15,7 +15,6 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct ext4_extent; -struct extent_status; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -1520,9 +1519,10 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned int len, int ret), - TP_ARGS(inode, map, ret), + TP_ARGS(inode, lblk, pblk, len, ret), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1530,37 +1530,37 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) - __field( unsigned int, flags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pblk = map->m_pblk; - __entry->lblk = map->m_lblk; - __entry->len = map->m_len; - __entry->flags = map->m_flags; + __entry->pblk = pblk; + __entry->lblk = lblk; + __entry->len = len; __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk, - __entry->len, __entry->flags, __entry->ret) + __entry->len, __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), + TP_PROTO(struct inode 
*inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), - TP_ARGS(inode, map, ret) + TP_ARGS(inode, lblk, pblk, len, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), - TP_ARGS(inode, map, ret) + TP_ARGS(inode, lblk, pblk, len, ret) ); TRACE_EVENT(ext4_ext_load_extent, @@ -1680,10 +1680,10 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, ); TRACE_EVENT(ext4_ext_handle_uninitialized_extents, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, unsigned int allocated, ext4_fsblk_t newblock), - TP_ARGS(inode, map, flags, allocated, newblock), + TP_ARGS(inode, map, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1699,7 +1699,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->flags = flags; + __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; @@ -1707,7 +1707,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, __entry->newblk = newblock; ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, @@ -2055,106 +2055,6 @@ TRACE_EVENT(ext4_ext_remove_space_done, (unsigned short) __entry->eh_entries) ); -TRACE_EVENT(ext4_es_insert_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), - - TP_ARGS(inode, start, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, start ) - __field( loff_t, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = start; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->start, __entry->len) -); - -TRACE_EVENT(ext4_es_remove_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), - - TP_ARGS(inode, start, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, start ) - __field( loff_t, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = start; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->start, __entry->len) -); - -TRACE_EVENT(ext4_es_find_extent_enter, - TP_PROTO(struct inode *inode, ext4_lblk_t start), - - TP_ARGS(inode, start), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, start ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = start; - ), - - TP_printk("dev %d,%d ino %lu start %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->start) -); - -TRACE_EVENT(ext4_es_find_extent_exit, - TP_PROTO(struct inode *inode, struct extent_status *es, - ext4_lblk_t ret), - - TP_ARGS(inode, es, ret), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, start ) - __field( 
ext4_lblk_t, len ) - __field( ext4_lblk_t, ret ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = es->start; - __entry->len = es->len; - __entry->ret = ret; - ), - - TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->start, __entry->len, __entry->ret) -); - #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/trunk/include/trace/events/migrate.h b/trunk/include/trace/events/migrate.h deleted file mode 100644 index ec2a6ccfd7e5..000000000000 --- a/trunk/include/trace/events/migrate.h +++ /dev/null @@ -1,51 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM migrate - -#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_MIGRATE_H - -#define MIGRATE_MODE \ - {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ - {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ - {MIGRATE_SYNC, "MIGRATE_SYNC"} - -#define MIGRATE_REASON \ - {MR_COMPACTION, "compaction"}, \ - {MR_MEMORY_FAILURE, "memory_failure"}, \ - {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ - {MR_SYSCALL, "syscall_or_cpuset"}, \ - {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ - {MR_CMA, "cma"} - -TRACE_EVENT(mm_migrate_pages, - - TP_PROTO(unsigned long succeeded, unsigned long failed, - enum migrate_mode mode, int reason), - - TP_ARGS(succeeded, failed, mode, reason), - - TP_STRUCT__entry( - __field( unsigned long, succeeded) - __field( unsigned long, failed) - __field( enum migrate_mode, mode) - __field( int, reason) - ), - - TP_fast_assign( - __entry->succeeded = succeeded; - __entry->failed = failed; - __entry->mode = mode; - __entry->reason = reason; - ), - - TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", - __entry->succeeded, - __entry->failed, - __print_symbolic(__entry->mode, MIGRATE_MODE), - __print_symbolic(__entry->reason, MIGRATE_REASON)) -); - -#endif /* _TRACE_MIGRATE_H */ - -/* This part must be outside protection */ -#include diff --git a/trunk/include/uapi/linux/mempolicy.h b/trunk/include/uapi/linux/mempolicy.h index 0d11c3dcd3a1..23e62e0537e2 100644 --- a/trunk/include/uapi/linux/mempolicy.h +++ b/trunk/include/uapi/linux/mempolicy.h @@ -20,7 +20,6 @@ enum { MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, - MPOL_LOCAL, MPOL_MAX, /* always last member of enum */ }; @@ -48,15 +47,9 @@ enum mpol_rebind_step { /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform - to policy */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ -#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ -#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ - -#define MPOL_MF_VALID (MPOL_MF_STRICT | \ - MPOL_MF_MOVE | \ - MPOL_MF_MOVE_ALL) +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ +#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ /* * Internal flags that share the struct mempolicy flags word with @@ -66,8 +59,6 @@ enum mpol_rebind_step { #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ -#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ -#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa 
Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 1a207efca591..2054e048bb98 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -717,50 +717,6 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool -# -# For architectures that want to enable the support for NUMA-affine scheduler -# balancing logic: -# -config ARCH_SUPPORTS_NUMA_BALANCING - bool - -# For architectures that (ab)use NUMA to represent different memory regions -# all cpu-local but of different latencies, such as SuperH. -# -config ARCH_WANT_NUMA_VARIABLE_LOCALITY - bool - -# -# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE -config ARCH_WANTS_PROT_NUMA_PROT_NONE - bool - -config ARCH_USES_NUMA_PROT_NONE - bool - default y - depends on ARCH_WANTS_PROT_NUMA_PROT_NONE - depends on NUMA_BALANCING - -config NUMA_BALANCING_DEFAULT_ENABLED - bool "Automatically enable NUMA aware memory/task placement" - default y - depends on NUMA_BALANCING - help - If set, autonumic NUMA balancing will be enabled if running on a NUMA - machine. - -config NUMA_BALANCING - bool "Memory placement aware NUMA scheduler" - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when - it is references to the node the task is running on. - - This system will be inactive on UMA systems. - menuconfig CGROUPS boolean "Control Group support" depends on EVENTFD diff --git a/trunk/kernel/cred.c b/trunk/kernel/cred.c index 8888afb846e9..48cea3da6d05 100644 --- a/trunk/kernel/cred.c +++ b/trunk/kernel/cred.c @@ -29,6 +29,17 @@ static struct kmem_cache *cred_jar; +/* + * The common credentials for the initial task's thread group + */ +#ifdef CONFIG_KEYS +static struct thread_group_cred init_tgcred = { + .usage = ATOMIC_INIT(2), + .tgid = 0, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), +}; +#endif + /* * The initial credentials for the initial task */ @@ -54,6 +65,9 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, +#ifdef CONFIG_KEYS + .tgcred = &init_tgcred, +#endif }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -81,6 +95,36 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) #endif } +/* + * Dispose of the shared task group credentials + */ +#ifdef CONFIG_KEYS +static void release_tgcred_rcu(struct rcu_head *rcu) +{ + struct thread_group_cred *tgcred = + container_of(rcu, struct thread_group_cred, rcu); + + BUG_ON(atomic_read(&tgcred->usage) != 0); + + key_put(tgcred->session_keyring); + key_put(tgcred->process_keyring); + kfree(tgcred); +} +#endif + +/* + * Release a set of thread group credentials. 
+ */ +static void release_tgcred(struct cred *cred) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = cred->tgcred; + + if (atomic_dec_and_test(&tgcred->usage)) + call_rcu(&tgcred->rcu, release_tgcred_rcu); +#endif +} + /* * The RCU callback to actually dispose of a set of credentials */ @@ -106,10 +150,9 @@ static void put_cred_rcu(struct rcu_head *rcu) #endif security_cred_free(cred); - key_put(cred->session_keyring); - key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); + release_tgcred(cred); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); @@ -203,6 +246,15 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; +#ifdef CONFIG_KEYS + new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); + if (!new->tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } + atomic_set(&new->tgcred->usage, 1); +#endif + atomic_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; @@ -256,10 +308,9 @@ struct cred *prepare_creds(void) get_user_ns(new->user_ns); #ifdef CONFIG_KEYS - key_get(new->session_keyring); - key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); + atomic_inc(&new->tgcred->usage); #endif #ifdef CONFIG_SECURITY @@ -283,20 +334,39 @@ EXPORT_SYMBOL(prepare_creds); */ struct cred *prepare_exec_creds(void) { + struct thread_group_cred *tgcred = NULL; struct cred *new; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + new = prepare_creds(); - if (!new) + if (!new) { + kfree(tgcred); return new; + } #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + /* inherit the session keyring; new process keyring */ - key_put(new->process_keyring); - new->process_keyring = NULL; + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; #endif return new; @@ -313,6 +383,9 @@ struct cred *prepare_exec_creds(void) */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif struct cred *new; int ret; @@ -352,12 +425,22 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) install_thread_keyring_to_cred(new); } - /* The process keyring is only shared between the threads in a process; - * anything outside of those threads doesn't inherit. 
- */ + /* we share the process and session keyrings between all the threads in + * a process - this is slightly icky as we violate COW credentials a + * bit */ if (!(clone_flags & CLONE_THREAD)) { - key_put(new->process_keyring); - new->process_keyring = NULL; + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + ret = -ENOMEM; + goto error_put; + } + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = key_get(new->tgcred->session_keyring); + + release_tgcred(new); + new->tgcred = tgcred; } #endif @@ -560,6 +643,9 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif const struct cred *old; struct cred *new; @@ -567,6 +653,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } +#endif + kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -584,10 +678,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - new->session_keyring = NULL; - new->process_keyring = NULL; - new->thread_keyring = NULL; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = NULL; + new->tgcred = tgcred; new->request_key_auth = NULL; + new->thread_keyring = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 115d6c2e4cca..3c31e874afad 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -822,9 +822,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; -#endif -#ifdef CONFIG_NUMA_BALANCING - mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index c1fb82104bfb..0533496b6228 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -193,10 +193,23 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static int sched_feat_set(char *cmp) +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - int i; + char buf[64]; + char *cmp; int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -216,27 +229,6 @@ static int sched_feat_set(char *cmp) } } - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1568,40 +1560,7 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies; - p->mm->numa_next_reset = jiffies; - p->mm->numa_scan_seq = 0; - } - - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? 
p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; -#endif /* CONFIG_NUMA_BALANCING */ -} - -#ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG -void set_numabalancing_state(bool enabled) -{ - if (enabled) - sched_feat_set("NUMA"); - else - sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool numabalancing_enabled; - -void set_numabalancing_state(bool enabled) -{ - numabalancing_enabled = enabled; } -#endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ /* * fork()/clone()-time setup: diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched/fair.c index 9af5af979a13..756f9f9e8542 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched/fair.c @@ -26,9 +26,6 @@ #include #include #include -#include -#include -#include #include @@ -777,227 +774,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#ifdef CONFIG_NUMA_BALANCING -/* - * numa task sample period in ms - */ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; - -/* Portion of address space to scan in MB */ -unsigned int sysctl_numa_balancing_scan_size = 256; - -/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ -unsigned int sysctl_numa_balancing_scan_delay = 1000; - -static void task_numa_placement(struct task_struct *p) -{ - int seq = ACCESS_ONCE(p->mm->numa_scan_seq); - - if (p->numa_scan_seq == seq) - return; - p->numa_scan_seq = seq; - - /* FIXME: Scheduling placement policy hints go here */ -} - -/* - * Got a PROT_NONE fault for a page on @node. - */ -void task_numa_fault(int node, int pages, bool migrated) -{ - struct task_struct *p = current; - - if (!sched_feat_numa(NUMA)) - return; - - /* FIXME: Allocate task-specific structure for placement policy here */ - - /* - * If pages are properly placed (did not migrate) then scan slower. - * This is reset periodically in case of phase changes - */ - if (!migrated) - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(10)); - - task_numa_placement(p); -} - -static void reset_ptenuma_scan(struct task_struct *p) -{ - ACCESS_ONCE(p->mm->numa_scan_seq)++; - p->mm->numa_scan_offset = 0; -} - -/* - * The expensive part of numa migration is done from task_work context. - * Triggered from task_tick_numa(). - */ -void task_numa_work(struct callback_head *work) -{ - unsigned long migrate, next_scan, now = jiffies; - struct task_struct *p = current; - struct mm_struct *mm = p->mm; - struct vm_area_struct *vma; - unsigned long start, end; - long pages; - - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); - - work->next = work; /* protect against double add */ - /* - * Who cares about NUMA placement when they're dying. - * - * NOTE: make sure not to dereference p->mm before this check, - * exit_task_work() happens _after_ exit_mm() so we could be called - * without p->mm even though we still had it when we enqueued this - * work. - */ - if (p->flags & PF_EXITING) - return; - - /* - * We do not care about task placement until a task runs on a node - * other than the first one used by the address space. This is - * largely because migrations are driven by what CPU the task - * is running on. 
If it's never scheduled on another node, it'll - * not migrate so why bother trapping the fault. - */ - if (mm->first_nid == NUMA_PTE_SCAN_INIT) - mm->first_nid = numa_node_id(); - if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { - /* Are we running on a new node yet? */ - if (numa_node_id() == mm->first_nid && - !sched_feat_numa(NUMA_FORCE)) - return; - - mm->first_nid = NUMA_PTE_SCAN_ACTIVE; - } - - /* - * Reset the scan period if enough time has gone by. Objective is that - * scanning will be reduced if pages are properly placed. As tasks - * can enter different phases this needs to be re-examined. Lacking - * proper tracking of reference behaviour, this blunt hammer is used. - */ - migrate = mm->numa_next_reset; - if (time_after(now, migrate)) { - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); - xchg(&mm->numa_next_reset, next_scan); - } - - /* - * Enforce maximal scan/migration frequency.. - */ - migrate = mm->numa_next_scan; - if (time_before(now, migrate)) - return; - - if (p->numa_scan_period == 0) - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - - next_scan = now + msecs_to_jiffies(p->numa_scan_period); - if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) - return; - - /* - * Do not set pte_numa if the current running node is rate-limited. - * This loses statistics on the fault but if we are unwilling to - * migrate to this node, it is less likely we can do useful work - */ - if (migrate_ratelimited(numa_node_id())) - return; - - start = mm->numa_scan_offset; - pages = sysctl_numa_balancing_scan_size; - pages <<= 20 - PAGE_SHIFT; /* MB in pages */ - if (!pages) - return; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, start); - if (!vma) { - reset_ptenuma_scan(p); - start = 0; - vma = mm->mmap; - } - for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) - continue; - - /* Skip small VMAs. They are not likely to be of relevance */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) - continue; - - do { - start = max(start, vma->vm_start); - end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); - end = min(end, vma->vm_end); - pages -= change_prot_numa(vma, start, end); - - start = end; - if (pages <= 0) - goto out; - } while (end != vma->vm_end); - } - -out: - /* - * It is possible to reach the end of the VMA list but the last few VMAs are - * not guaranteed to the vma_migratable. If they are not, we would find the - * !migratable VMA on the next scan but not reset the scanner to the start - * so check it now. - */ - if (vma) - mm->numa_scan_offset = start; - else - reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); -} - -/* - * Drive the periodic memory faults.. - */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) -{ - struct callback_head *work = &curr->numa_work; - u64 period, now; - - /* - * We don't care about NUMA placement if we don't have memory. - */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) - return; - - /* - * Using runtime rather than walltime has the dual advantage that - * we (mostly) drive the selection from busy threads and that the - * task needs to have done some actual work before we bother with - * NUMA placement. 
- */ - now = curr->se.sum_exec_runtime; - period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - - if (now - curr->node_stamp > period) { - if (!curr->node_stamp) - curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; - curr->node_stamp = now; - - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ - task_work_add(curr, work, true); - } - } -} -#else -static void task_tick_numa(struct rq *rq, struct task_struct *curr) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -5725,9 +5501,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_feat_numa(NUMA)) - task_tick_numa(rq, curr); - update_rq_runnable_avg(rq, 1); } diff --git a/trunk/kernel/sched/features.h b/trunk/kernel/sched/features.h index 1ad1d2b5395f..e68e69ab917d 100644 --- a/trunk/kernel/sched/features.h +++ b/trunk/kernel/sched/features.h @@ -66,14 +66,3 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) - -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing=. Allow PTE scanning to be forced on UMA machines - * for debugging the core machinery. - */ -#ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) -SCHED_FEAT(NUMA_FORCE, false) -#endif diff --git a/trunk/kernel/sched/sched.h b/trunk/kernel/sched/sched.h index fc886441436a..5eca173b563f 100644 --- a/trunk/kernel/sched/sched.h +++ b/trunk/kernel/sched/sched.h @@ -663,18 +663,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) -#else -extern bool numabalancing_enabled; -#endif /* CONFIG_SCHED_DEBUG */ -#else -#define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) -#endif /* CONFIG_NUMA_BALANCING */ - static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/trunk/kernel/seccomp.c b/trunk/kernel/seccomp.c index 5af44b593770..ee376beedaf9 100644 --- a/trunk/kernel/seccomp.c +++ b/trunk/kernel/seccomp.c @@ -396,29 +396,25 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; - struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, + syscall_set_return_value(current, task_pt_regs(current), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, regs); + syscall_rollback(current, task_pt_regs(current)); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. 
*/ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) goto skip; - } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -429,9 +425,6 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. */ - return 0; case SECCOMP_RET_ALLOW: return 0; diff --git a/trunk/kernel/sysctl.c b/trunk/kernel/sysctl.c index c88878db491e..33f71f37267e 100644 --- a/trunk/kernel/sysctl.c +++ b/trunk/kernel/sysctl.c @@ -256,11 +256,9 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif /* CONFIG_SMP */ -#endif /* CONFIG_SCHED_DEBUG */ +#endif #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -303,7 +301,6 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, -#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -350,45 +347,7 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif /* CONFIG_SMP */ -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing_scan_delay_ms", - .data = &sysctl_numa_balancing_scan_delay, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_min_ms", - .data = &sysctl_numa_balancing_scan_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_reset", - .data = &sysctl_numa_balancing_scan_period_reset, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_max_ms", - .data = &sysctl_numa_balancing_scan_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_size_mb", - .data = &sysctl_numa_balancing_scan_size, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ +#endif { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/trunk/lib/swiotlb.c b/trunk/lib/swiotlb.c index 196b06984dec..f114bf6a8e13 100644 --- a/trunk/lib/swiotlb.c +++ b/trunk/lib/swiotlb.c @@ -57,7 +57,7 @@ int swiotlb_force; * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. 
*/ -static phys_addr_t io_tlb_start, io_tlb_end; +static char *io_tlb_start, *io_tlb_end; /* * The number of IO TLB blocks (in groups of 64) between io_tlb_start and @@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs; */ static unsigned long io_tlb_overflow = 32*1024; -static phys_addr_t io_tlb_overflow_buffer; +static void *io_tlb_overflow_buffer; /* * This is a free list describing the number of free entries available from @@ -125,37 +125,26 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; + phys_addr_t pstart, pend; - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); + pstart = virt_to_phys(io_tlb_start); + pend = virt_to_phys(io_tlb_end); printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); + (unsigned long long)pstart, (unsigned long long)pend - 1, + bytes >> 20, io_tlb_start, io_tlb_end - 1); } void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - void *v_overflow_buffer; unsigned long i, bytes; bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); - if (!v_overflow_buffer) - panic("Cannot allocate SWIOTLB overflow buffer!\n"); - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE @@ -167,6 +156,12 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_index = 0; io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); + if (!io_tlb_overflow_buffer) + panic("Cannot allocate SWIOTLB overflow buffer!\n"); if (verbose) swiotlb_print_info(); } @@ -178,7 +173,6 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) static void __init swiotlb_init_with_default_size(size_t default_size, int verbose) { - unsigned char *vstart; unsigned long bytes; if (!io_tlb_nslabs) { @@ -191,11 +185,11 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) /* * Get IO TLB memory from the low pages */ - vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); - if (!vstart) + io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); + if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer"); - swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose); + swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose); } void __init @@ -213,7 +207,6 @@ int swiotlb_late_init_with_default_size(size_t default_size) { unsigned long bytes, req_nslabs = io_tlb_nslabs; - unsigned char *vstart = NULL; unsigned int order; int rc = 0; @@ -230,14 +223,14 @@ swiotlb_late_init_with_default_size(size_t default_size) bytes = io_tlb_nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, - order); - if (vstart) + io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); + if (io_tlb_start) break; order--; } - if (!vstart) { + if 
(!io_tlb_start) { io_tlb_nslabs = req_nslabs; return -ENOMEM; } @@ -246,9 +239,9 @@ swiotlb_late_init_with_default_size(size_t default_size) "for software IO TLB\n", (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(io_tlb_start, io_tlb_nslabs); if (rc) - free_pages((unsigned long)vstart, order); + free_pages((unsigned long)io_tlb_start, order); return rc; } @@ -256,25 +249,14 @@ int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; - unsigned char *v_overflow_buffer; bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; - memset(tlb, 0, bytes); - - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); + memset(io_tlb_start, 0, bytes); /* * Allocate and initialize the free list array. This array is used @@ -284,7 +266,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) - goto cleanup3; + goto cleanup2; for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); @@ -295,10 +277,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) get_order(io_tlb_nslabs * sizeof(phys_addr_t))); if (!io_tlb_orig_addr) - goto cleanup4; + goto cleanup3; memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); + if (!io_tlb_overflow_buffer) + goto cleanup4; + swiotlb_print_info(); late_alloc = 1; @@ -306,42 +296,42 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; cleanup4: + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + io_tlb_orig_addr = NULL; +cleanup3: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); io_tlb_list = NULL; -cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; cleanup2: - io_tlb_end = 0; - io_tlb_start = 0; + io_tlb_end = NULL; + io_tlb_start = NULL; io_tlb_nslabs = 0; return -ENOMEM; } void __init swiotlb_free(void) { - if (!io_tlb_orig_addr) + if (!io_tlb_overflow_buffer) return; if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), + free_pages((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow)); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), + free_pages((unsigned long)io_tlb_start, get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - free_bootmem_late(io_tlb_overflow_buffer, + free_bootmem_late(__pa(io_tlb_overflow_buffer), PAGE_ALIGN(io_tlb_overflow)); free_bootmem_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); free_bootmem_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - free_bootmem_late(io_tlb_start, + free_bootmem_late(__pa(io_tlb_start), PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 
0; @@ -349,21 +339,21 @@ void __init swiotlb_free(void) static int is_swiotlb_buffer(phys_addr_t paddr) { - return paddr >= io_tlb_start && paddr < io_tlb_end; + return paddr >= virt_to_phys(io_tlb_start) && + paddr < virt_to_phys(io_tlb_end); } /* * Bounce: copy the swiotlb buffer back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { - unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned long pfn = PFN_DOWN(phys); if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ - unsigned int offset = orig_addr & ~PAGE_MASK; + unsigned int offset = phys & ~PAGE_MASK; char *buffer; unsigned int sz = 0; unsigned long flags; @@ -374,31 +364,32 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn)); if (dir == DMA_TO_DEVICE) - memcpy(vaddr, buffer + offset, sz); + memcpy(dma_addr, buffer + offset, sz); else - memcpy(buffer + offset, vaddr, sz); + memcpy(buffer + offset, dma_addr, sz); kunmap_atomic(buffer); local_irq_restore(flags); size -= sz; pfn++; - vaddr += sz; + dma_addr += sz; offset = 0; } - } else if (dir == DMA_TO_DEVICE) { - memcpy(vaddr, phys_to_virt(orig_addr), size); } else { - memcpy(phys_to_virt(orig_addr), vaddr, size); + if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, phys_to_virt(phys), size); + else + memcpy(phys_to_virt(phys), dma_addr, size); } } +EXPORT_SYMBOL_GPL(swiotlb_bounce); -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, size_t size, - enum dma_data_direction dir) +void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir) { unsigned long flags; - phys_addr_t tlb_addr; + char *dma_addr; unsigned int nslots, stride, index, wrap; int i; unsigned long mask; @@ -462,7 +453,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, io_tlb_list[i] = 0; for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); + dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); /* * Update the indices to avoid searching in the next @@ -480,7 +471,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); - return SWIOTLB_MAP_ERROR; + return NULL; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -490,11 +481,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * needed. */ for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); - return tlb_addr; + return dma_addr; } EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); @@ -502,10 +493,11 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); * Allocates bounce buffer and returns its kernel virtual address. 
*/ -phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir) +static void * +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir) { - dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); } @@ -513,19 +505,20 @@ phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +void +swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t phys = io_tlb_orig_addr[index]; /* * First, sync the memory before unmapping the entry */ - if (orig_addr && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -554,27 +547,26 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, } EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single); -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void +swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir, + enum dma_sync_target target) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t phys = io_tlb_orig_addr[index]; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); + swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); + swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); break; @@ -597,15 +589,12 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_mask = hwdev->coherent_dma_mask; ret = (void *)__get_free_pages(flags, order); - if (ret) { - dev_addr = swiotlb_virt_to_bus(hwdev, ret); - if (dev_addr + size - 1 > dma_mask) { - /* - * The allocated memory isn't reachable by the device. - */ - free_pages((unsigned long) ret, order); - ret = NULL; - } + if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) { + /* + * The allocated memory isn't reachable by the device. 
+ */ + free_pages((unsigned long) ret, order); + ret = NULL; } if (!ret) { /* @@ -613,29 +602,25 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ - phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE); - if (paddr == SWIOTLB_MAP_ERROR) + ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); + if (!ret) return NULL; + } - ret = phys_to_virt(paddr); - dev_addr = phys_to_dma(hwdev, paddr); + memset(ret, 0, size); + dev_addr = swiotlb_virt_to_bus(hwdev, ret); - /* Confirm address can be DMA'd by device */ - if (dev_addr + size - 1 > dma_mask) { - printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dma_mask, - (unsigned long long)dev_addr); + /* Confirm address can be DMA'd by device */ + if (dev_addr + size - 1 > dma_mask) { + printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", + (unsigned long long)dma_mask, + (unsigned long long)dev_addr); - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - swiotlb_tbl_unmap_single(hwdev, paddr, - size, DMA_TO_DEVICE); - return NULL; - } + /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ + swiotlb_tbl_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + return NULL; } - *dma_handle = dev_addr; - memset(ret, 0, size); - return ret; } EXPORT_SYMBOL(swiotlb_alloc_coherent); @@ -651,7 +636,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ - swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE); + swiotlb_tbl_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -692,8 +677,9 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); + void *map; BUG_ON(dir == DMA_NONE); /* @@ -704,19 +690,23 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; - /* Oh well, have to allocate and map a bounce buffer. */ + /* + * Oh well, have to allocate and map a bounce buffer. 
+ */ map = map_single(dev, phys, size, dir); - if (map == SWIOTLB_MAP_ERROR) { + if (!map) { swiotlb_full(dev, size, dir, 1); - return phys_to_dma(dev, io_tlb_overflow_buffer); + map = io_tlb_overflow_buffer; } - dev_addr = phys_to_dma(dev, map); + dev_addr = swiotlb_virt_to_bus(dev, map); - /* Ensure that the address returned is DMA'ble */ + /* + * Ensure that the address returned is DMA'ble + */ if (!dma_capable(dev, dev_addr, size)) { swiotlb_tbl_unmap_single(dev, map, size, dir); - return phys_to_dma(dev, io_tlb_overflow_buffer); + dev_addr = swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer); } return dev_addr; @@ -739,7 +729,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } @@ -783,7 +773,8 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); return; } @@ -840,9 +831,9 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir); - if (map == SWIOTLB_MAP_ERROR) { + void *map = map_single(hwdev, sg_phys(sg), + sg->length, dir); + if (!map) { /* Don't panic here, we expect map_sg users to do proper error handling. */ swiotlb_full(hwdev, sg->length, dir, 0); @@ -851,7 +842,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, sgl[0].dma_length = 0; return 0; } - sg->dma_address = phys_to_dma(hwdev, map); + sg->dma_address = swiotlb_virt_to_bus(hwdev, map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; @@ -934,7 +925,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); + return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer)); } EXPORT_SYMBOL(swiotlb_dma_mapping_error); @@ -947,6 +938,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error); int swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask; } EXPORT_SYMBOL(swiotlb_dma_supported); diff --git a/trunk/mm/compaction.c b/trunk/mm/compaction.c index 5ad7f4f4d6f7..129791218226 100644 --- a/trunk/mm/compaction.c +++ b/trunk/mm/compaction.c @@ -303,10 +303,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_vm_events(COMPACTFREE_SCANNED, nr_scanned); - if (total_isolated) - count_vm_events(COMPACTISOLATED, total_isolated); - return total_isolated; } @@ -613,10 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); - if (nr_isolated) - count_vm_events(COMPACTISOLATED, nr_isolated); - return low_pfn; } @@ -1023,11 +1015,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - 
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, - MR_COMPACTION); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; + count_vm_event(COMPACTBLOCKS); + count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); + if (nr_remaining) + count_vm_events(COMPACTPAGEFAILED, nr_remaining); trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); diff --git a/trunk/mm/huge_memory.c b/trunk/mm/huge_memory.c index d7ee1691fd21..827d9c813051 100644 --- a/trunk/mm/huge_memory.c +++ b/trunk/mm/huge_memory.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -691,7 +690,7 @@ static int __init setup_transparent_hugepage(char *str) } __setup("transparent_hugepage=", setup_transparent_hugepage); -pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -849,8 +848,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) @@ -1289,81 +1287,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return page; } -/* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) -{ - struct page *page; - unsigned long haddr = addr & HPAGE_PMD_MASK; - int target_nid; - int current_nid = -1; - bool migrated; - bool page_locked = false; - - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) - goto out_unlock; - - page = pmd_page(pmd); - get_page(page); - current_nid = page_to_nid(page); - count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - put_page(page); - goto clear_pmdnuma; - } - - /* Acquire the page lock to serialise THP migrations */ - spin_unlock(&mm->page_table_lock); - lock_page(page); - page_locked = true; - - /* Confirm the PTE did not while locked */ - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - put_page(page); - goto out_unlock; - } - spin_unlock(&mm->page_table_lock); - - /* Migrate the THP to the requested node */ - migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, - page, target_nid); - if (migrated) - current_nid = target_nid; - else { - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - goto out_unlock; - } - goto clear_pmdnuma; - } - - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); - return 0; - -clear_pmdnuma: - pmd = pmd_mknonnuma(pmd); - set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); - update_mmu_cache_pmd(vma, addr, pmdp); - if (page_locked) - unlock_page(page); - -out_unlock: - spin_unlock(&mm->page_table_lock); - if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); - return 0; -} - int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1452,7 
+1375,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1460,17 +1383,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - if (!prot_numa) - entry = pmd_modify(entry, newprot); - else { - struct page *page = pmd_page(*pmd); - - /* only check non-shared pages */ - if (page_mapcount(page) == 1 && - !pmd_numa(*pmd)) { - entry = pmd_mknuma(entry); - } - } + entry = pmd_modify(entry, newprot); BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); @@ -1561,7 +1474,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->rwsem to + * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1652,7 +1565,6 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); @@ -1720,8 +1632,6 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); - if (pmd_numa(*pmd)) - entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1764,7 +1674,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->rwsem held */ +/* must be called with anon_vma->root->mutex hold */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { @@ -1819,7 +1729,7 @@ int split_huge_page(struct page *page) BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma(page); if (!anon_vma) goto out; ret = 0; @@ -1832,7 +1742,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma_read(anon_vma); + page_unlock_anon_vma(anon_vma); out: return ret; } @@ -2324,7 +2234,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; - anon_vma_lock_write(vma->anon_vma); + anon_vma_lock(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index e5318c7793ae..88e7293b96bd 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -3016,7 +3016,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? 
i : -EFAULT; } -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; @@ -3024,7 +3024,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0; BUG_ON(address >= end); flush_cache_range(vma, address, end); @@ -3035,15 +3034,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (!ptep) continue; - if (huge_pmd_unshare(mm, &address, ptep)) { - pages++; + if (huge_pmd_unshare(mm, &address, ptep)) continue; - } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); - pages++; } } spin_unlock(&mm->page_table_lock); @@ -3055,8 +3051,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); - - return pages << h->order; } int hugetlb_reserve_pages(struct inode *inode, diff --git a/trunk/mm/internal.h b/trunk/mm/internal.h index d597f94cc205..52d1fa957194 100644 --- a/trunk/mm/internal.h +++ b/trunk/mm/internal.h @@ -217,18 +217,15 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; - int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + __dec_zone_page_state(page, NR_MLOCK); SetPageMlocked(newpage); - __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); + __inc_zone_page_state(newpage, NR_MLOCK); local_irq_restore(flags); } } -extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); diff --git a/trunk/mm/ksm.c b/trunk/mm/ksm.c index 82dfb4b54321..382d930a0bf1 100644 --- a/trunk/mm/ksm.c +++ b/trunk/mm/ksm.c @@ -1624,7 +1624,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1678,7 +1678,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1731,7 +1731,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/trunk/mm/memcontrol.c b/trunk/mm/memcontrol.c index bbfac5063ca8..6c055929c8cc 100644 --- a/trunk/mm/memcontrol.c +++ b/trunk/mm/memcontrol.c @@ -3289,18 +3289,15 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; - if 
(PageTransHuge(page)) - nr_pages <<= compound_order(page); - pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3362,7 +3359,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/trunk/mm/memory-failure.c b/trunk/mm/memory-failure.c index c6e4dd3e1c08..108c52fa60f6 100644 --- a/trunk/mm/memory-failure.c +++ b/trunk/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma_read(page); + av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma_read(av); + page_unlock_anon_vma(av); } /* @@ -1566,8 +1566,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC, - MR_MEMORY_FAILURE); + false, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index e6a3b933517e..db2e9e797a05 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -1504,8 +1503,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(vma, address, pmd); @@ -1535,8 +1532,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pte = *ptep; if (!pte_present(pte)) goto no_page; - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -1688,19 +1683,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. 
- */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - i = 0; do { @@ -3430,169 +3412,6 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) -{ - get_page(page); - - count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - - return mpol_misplaced(page, vma, addr); -} - -int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) -{ - struct page *page = NULL; - spinlock_t *ptl; - int current_nid = -1; - int target_nid; - bool migrated = false; - - /* - * The "pte" at this point cannot be used safely without - * validation through pte_unmap_same(). It's of NUMA type but - * the pfn may be screwed if the read is non atomic. - * - * ptep_modify_prot_start is not called as this is clearing - * the _PAGE_NUMA bit and it is not really expected that there - * would be concurrent hardware modifications to the PTE. - */ - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) { - pte_unmap_unlock(ptep, ptl); - goto out; - } - - pte = pte_mknonnuma(pte); - set_pte_at(mm, addr, ptep, pte); - update_mmu_cache(vma, addr, ptep); - - page = vm_normal_page(vma, addr, pte); - if (!page) { - pte_unmap_unlock(ptep, ptl); - return 0; - } - - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); - pte_unmap_unlock(ptep, ptl); - if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. 
- */ - current_nid = numa_node_id(); - put_page(page); - goto out; - } - - /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - current_nid = target_nid; - -out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); - return 0; -} - -/* NUMA hinting page fault entry point for regular pmds */ -#ifdef CONFIG_NUMA_BALANCING -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - pte_t *pte, *orig_pte; - unsigned long _addr = addr & PMD_MASK; - unsigned long offset; - spinlock_t *ptl; - bool numa = false; - int local_nid = numa_node_id(); - - spin_lock(&mm->page_table_lock); - pmd = *pmdp; - if (pmd_numa(pmd)) { - set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); - numa = true; - } - spin_unlock(&mm->page_table_lock); - - if (!numa) - return 0; - - /* we're in a page fault so some vma must be in the range */ - BUG_ON(!vma); - BUG_ON(vma->vm_start >= _addr + PMD_SIZE); - offset = max(_addr, vma->vm_start) & ~PMD_MASK; - VM_BUG_ON(offset >= PMD_SIZE); - orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); - pte += offset >> PAGE_SHIFT; - for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { - pte_t pteval = *pte; - struct page *page; - int curr_nid = local_nid; - int target_nid; - bool migrated; - if (!pte_present(pteval)) - continue; - if (!pte_numa(pteval)) - continue; - if (addr >= vma->vm_end) { - vma = find_vma(mm, addr); - /* there's a pte present so there must be a vma */ - BUG_ON(!vma); - BUG_ON(addr < vma->vm_start); - } - if (pte_numa(pteval)) { - pteval = pte_mknonnuma(pteval); - set_pte_at(mm, addr, pte, pteval); - } - page = vm_normal_page(vma, addr, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (unlikely(page_mapcount(page) != 1)) - continue; - - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. 
- */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { - put_page(page); - continue; - } - - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); - - pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); - } - pte_unmap_unlock(orig_pte, ptl); - - return 0; -} -#else -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3631,9 +3450,6 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } - if (pte_numa(entry)) - return do_numa_page(mm, vma, address, entry, pte, pmd); - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3704,11 +3520,8 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - if (pmd_numa(orig_pmd)) - return do_huge_pmd_numa_page(mm, vma, address, - orig_pmd, pmd); - - if (dirty && !pmd_write(orig_pmd)) { + if (dirty && !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); /* @@ -3723,21 +3536,16 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); } - return 0; } } - if (pmd_numa(*pmd)) - return do_pmd_numa_page(mm, vma, address, pmd); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) diff --git a/trunk/mm/memory_hotplug.c b/trunk/mm/memory_hotplug.c index 962e353aa86f..518baa896e83 100644 --- a/trunk/mm/memory_hotplug.c +++ b/trunk/mm/memory_hotplug.c @@ -1055,8 +1055,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. 
*/ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC, - MR_MEMORY_HOTPLUG); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/trunk/mm/mempolicy.c b/trunk/mm/mempolicy.c index d1b315e98627..aaf54566cb6b 100644 --- a/trunk/mm/mempolicy.c +++ b/trunk/mm/mempolicy.c @@ -90,7 +90,6 @@ #include #include #include -#include #include #include @@ -118,26 +117,6 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; -static struct mempolicy preferred_node_policy[MAX_NUMNODES]; - -static struct mempolicy *get_task_policy(struct task_struct *p) -{ - struct mempolicy *pol = p->mempolicy; - int node; - - if (!pol) { - node = numa_node_id(); - if (node != -1) - pol = &preferred_node_policy[node]; - - /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; - } - - return pol; -} - static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -275,7 +254,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; + return NULL; /* simply delete any existing policy */ } VM_BUG_ON(!nodes); @@ -290,10 +269,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } - } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) - return ERR_PTR(-EINVAL); - mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -586,36 +561,6 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -/* - * This is used to mark a range of virtual addresses to be inaccessible. - * These are later cleared by a NUMA hinting fault. Depending on these - * faults, pages may be migrated for better NUMA placement. - * - * This is assuming that NUMA faults are handled using PROT_NONE. If - * an architecture makes a different choice, it will need further - * changes to the core. - */ -unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); - if (nr_updated) - count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); - - return nr_updated; -} -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ - /* * Check if all pages in a range are on a set of nodes. 
* If pagelist != NULL then isolate pages from the LRU and @@ -634,32 +579,22 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - - if (is_vm_hugetlb_page(vma)) - goto next; - - if (flags & MPOL_MF_LAZY) { - change_prot_numa(vma, start, endvma); - goto next; - } - - if ((flags & MPOL_MF_STRICT) || + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) { + vma_migratable(vma)))) { + unsigned long endvma = vma->vm_end; + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -667,7 +602,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } -next: prev = vma; } return first; @@ -1027,8 +961,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC, - MR_SYSCALL); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } @@ -1200,7 +1133,8 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)MPOL_MF_VALID) + if (flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1223,9 +1157,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1262,24 +1193,21 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); /* maybe ... 
*/ - if (!IS_ERR(vma)) - err = mbind_range(mm, start, end, new); - - if (!err) { + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { int nr_failed = 0; + err = mbind_range(mm, start, end, new); + if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, MIGRATE_SYNC, - MR_MEMPOLICY_MBIND); + false, MIGRATE_SYNC); if (nr_failed) putback_lru_pages(&pagelist); } - if (nr_failed && (flags & MPOL_MF_STRICT)) + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); @@ -1618,7 +1546,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_task_policy(task); + struct mempolicy *pol = task->mempolicy; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -2028,7 +1956,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = get_task_policy(current); + struct mempolicy *pol = current->mempolicy; struct page *page; unsigned int cpuset_mems_cookie; @@ -2212,115 +2140,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -/** - * mpol_misplaced - check whether current page node is valid in policy - * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped - * - * Lookup current policy node id for vma,addr and "compare to" page's - * node id. - * - * Returns: - * -1 - not misplaced, page is in the right node - * node - node id where the page should be - * - * Policy determination "mimics" alloc_page_vma(). - * Called from fault path where we know the vma and faulting address. - */ -int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol; - struct zone *zone; - int curnid = page_to_nid(page); - unsigned long pgoff; - int polnid = -1; - int ret = -1; - - BUG_ON(!vma); - - pol = get_vma_policy(current, vma, addr); - if (!(pol->flags & MPOL_F_MOF)) - goto out; - - switch (pol->mode) { - case MPOL_INTERLEAVE: - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); - - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, vma, pgoff); - break; - - case MPOL_PREFERRED: - if (pol->flags & MPOL_F_LOCAL) - polnid = numa_node_id(); - else - polnid = pol->v.preferred_node; - break; - - case MPOL_BIND: - /* - * allows binding to multiple nodes. - * use current page if in policy nodemask, - * else select nearest allowed node, if any. - * If no allowed nodes, use current [!misplaced]. - */ - if (node_isset(curnid, pol->v.nodes)) - goto out; - (void)first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), - gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; - break; - - default: - BUG(); - } - - /* Migrate the page towards the node whose CPU is referencing it */ - if (pol->flags & MPOL_F_MORON) { - int last_nid; - - polnid = numa_node_id(); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. 
- * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_nid = page_xchg_last_nid(page, polnid); - if (last_nid != polnid) - goto out; - } - - if (curnid != polnid) - ret = polnid; -out: - mpol_cond_put(pol); - - return ret; -} - static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); @@ -2486,50 +2305,6 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } -#ifdef CONFIG_NUMA_BALANCING -static bool __initdata numabalancing_override; - -static void __init check_numabalancing_enable(void) -{ - bool numabalancing_default = false; - - if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) - numabalancing_default = true; - - if (nr_node_ids > 1 && !numabalancing_override) { - printk(KERN_INFO "Enabling automatic NUMA balancing. " - "Configure with numa_balancing= or sysctl"); - set_numabalancing_state(numabalancing_default); - } -} - -static int __init setup_numabalancing(char *str) -{ - int ret = 0; - if (!str) - goto out; - numabalancing_override = true; - - if (!strcmp(str, "enable")) { - set_numabalancing_state(true); - ret = 1; - } else if (!strcmp(str, "disable")) { - set_numabalancing_state(false); - ret = 1; - } -out: - if (!ret) - printk(KERN_WARNING "Unable to parse numa_balancing=\n"); - - return ret; -} -__setup("numa_balancing=", setup_numabalancing); -#else -static inline void __init check_numabalancing_enable(void) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2545,15 +2320,6 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); - for_each_node(nid) { - preferred_node_policy[nid] = (struct mempolicy) { - .refcnt = ATOMIC_INIT(1), - .mode = MPOL_PREFERRED, - .flags = MPOL_F_MOF | MPOL_F_MORON, - .v = { .preferred_node = nid, }, - }; - } - /* * Set interleaving policy for system init. 
Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or @@ -2580,8 +2346,6 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); - - check_numabalancing_enable(); } /* Reset policy of current process to default */ @@ -2598,13 +2362,14 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ +#define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local", + [MPOL_LOCAL] = "local" }; @@ -2650,12 +2415,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { + for (mode = 0; mode <= MPOL_LOCAL; mode++) { if (!strcmp(str, policy_modes[mode])) { break; } } - if (mode >= MPOL_MAX) + if (mode > MPOL_LOCAL) goto out; switch (mode) { diff --git a/trunk/mm/migrate.c b/trunk/mm/migrate.c index 32efd8028bc9..cae02711181d 100644 --- a/trunk/mm/migrate.c +++ b/trunk/mm/migrate.c @@ -39,9 +39,6 @@ #include -#define CREATE_TRACE_POINTS -#include - #include "internal.h" /* @@ -296,7 +293,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, struct buffer_head *head, enum migrate_mode mode) { - int expected_count = 0; + int expected_count; void **pslot; if (!mapping) { @@ -424,7 +421,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page) || PageTransHuge(page)) + if (PageHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -768,7 +765,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (PageAnon(page)) { /* - * Only page_lock_anon_vma_read() understands the subtleties of + * Only page_lock_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. 
*/ anon_vma = page_get_anon_vma(page); @@ -1001,11 +998,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode, int reason) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; - int nr_succeeded = 0; int pass = 0; struct page *page; struct page *page2; @@ -1032,7 +1028,6 @@ int migrate_pages(struct list_head *from, retry++; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded++; break; default: /* Permanent failure */ @@ -1043,12 +1038,6 @@ int migrate_pages(struct list_head *from, } rc = nr_failed + retry; out: - if (nr_succeeded) - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - if (nr_failed) - count_vm_events(PGMIGRATE_FAIL, nr_failed); - trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); - if (!swapwrite) current->flags &= ~PF_SWAPWRITE; @@ -1187,8 +1176,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC, - MR_SYSCALL); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } @@ -1452,317 +1440,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } - -#ifdef CONFIG_NUMA_BALANCING -/* - * Returns true if this is a safe migration target node for misplaced NUMA - * pages. Currently it only checks the watermarks which crude - */ -static bool migrate_balanced_pgdat(struct pglist_data *pgdat, - int nr_migrate_pages) -{ - int z; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { - struct zone *zone = pgdat->node_zones + z; - - if (!populated_zone(zone)) - continue; - - if (zone->all_unreclaimable) - continue; - - /* Avoid waking kswapd by allocating pages_to_migrate pages. */ - if (!zone_watermark_ok(zone, 0, - high_wmark_pages(zone) + - nr_migrate_pages, - 0, 0)) - continue; - return true; - } - return false; -} - -static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long data, - int **result) -{ - int nid = (int) data; - struct page *newpage; - - newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & - ~GFP_IOFS, 0); - if (newpage) - page_xchg_last_nid(newpage, page_last_nid(page)); - - return newpage; -} - -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - * If a node is rate-limited then PTE NUMA updates are also rate-limited. However - * as it is faults that reset the window, pte updates will happen unconditionally - * if there has not been a fault since @pteupdate_interval_millisecs after the - * throttle window closed. 
- */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if NUMA migration is currently rate limited */ -bool migrate_ratelimited(int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + - msecs_to_jiffies(pteupdate_interval_millisecs))) - return false; - - if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) - return false; - - return true; -} - -/* Returns true if the node is migrate rate-limited after the update */ -bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) -{ - bool rate_limited = false; - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - spin_lock(&pgdat->numabalancing_migrate_lock); - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies + - msecs_to_jiffies(migrate_interval_millisecs); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) - rate_limited = true; - else - pgdat->numabalancing_migrate_nr_pages += nr_pages; - spin_unlock(&pgdat->numabalancing_migrate_lock); - - return rate_limited; -} - -int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) -{ - int ret = 0; - - /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(pgdat, 1)) { - int page_lru; - - if (isolate_lru_page(page)) { - put_page(page); - return 0; - } - - /* Page is isolated */ - ret = 1; - page_lru = page_is_file_cache(page); - if (!PageTransHuge(page)) - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - else - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - HPAGE_PMD_NR); - } - - /* - * Page is either isolated or there is not enough space on the target - * node. If isolated, then it has taken a reference count and the - * callers reference can be safely dropped without the page - * disappearing underneath us during migration. Otherwise the page is - * not to be migrated but the callers reference should still be - * dropped so it does not leak. - */ - put_page(page); - - return ret; -} - -/* - * Attempt to migrate a misplaced page to the specified destination - * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. - */ -int migrate_misplaced_page(struct page *page, int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - int nr_remaining; - LIST_HEAD(migratepages); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) { - put_page(page); - goto out; - } - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! 
- */ - if (numamigrate_update_ratelimit(pgdat, 1)) { - put_page(page); - goto out; - } - - isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) - goto out; - - list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); - if (nr_remaining) { - putback_lru_pages(&migratepages); - isolated = 0; - } else - count_vm_numa_event(NUMA_PAGE_MIGRATE); - BUG_ON(!list_empty(&migratepages)); -out: - return isolated; -} -#endif /* CONFIG_NUMA_BALANCING */ - -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node) -{ - unsigned long haddr = address & HPAGE_PMD_MASK; - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - struct page *new_page = NULL; - struct mem_cgroup *memcg = NULL; - int page_lru = page_is_file_cache(page); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) - goto out_dropref; - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; - - new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); - if (!new_page) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out_dropref; - } - page_xchg_last_nid(new_page, page_last_nid(page)); - - isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - put_page(new_page); - goto out_keep_locked; - } - - /* Prepare a page as a migration target */ - __set_page_locked(new_page); - SetPageSwapBacked(new_page); - - /* anon mapping, we can simply copy page->mapping to the new page: */ - new_page->mapping = page->mapping; - new_page->index = page->index; - migrate_page_copy(new_page, page); - WARN_ON(PageLRU(new_page)); - - /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); - - /* Reverse changes made by migrate_page_copy() */ - if (TestClearPageActive(new_page)) - SetPageActive(page); - if (TestClearPageUnevictable(new_page)) - SetPageUnevictable(page); - mlock_migrate_page(page, new_page); - - unlock_page(new_page); - put_page(new_page); /* Free it */ - - unlock_page(page); - putback_lru_page(page); - - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out; - } - - /* - * Traditional migration needs to prepare the memcg charge - * transaction early to prevent the old page from being - * uncharged when installing migration entries. Here we can - * save the potential rollback and start the charge transfer - * only when migration is already known to end successfully. 
- */ - mem_cgroup_prepare_migration(page, new_page, &memcg); - - entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mknonnuma(entry); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); - - page_add_new_anon_rmap(new_page, vma, haddr); - - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, entry); - page_remove_rmap(page); - /* - * Finish the charge transaction under the page table lock to - * prevent split_huge_page() from dividing up the charge - * before it's fully transferred to the new page. - */ - mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); - - unlock_page(new_page); - unlock_page(page); - put_page(page); /* Drop the rmap reference */ - put_page(page); /* Drop the LRU isolation reference */ - - count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); - count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); - -out: - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - -HPAGE_PMD_NR); - return isolated; - -out_dropref: - put_page(page); -out_keep_locked: - return 0; -} -#endif /* CONFIG_NUMA_BALANCING */ - -#endif /* CONFIG_NUMA */ +#endif diff --git a/trunk/mm/mmap.c b/trunk/mm/mmap.c index f54b235f29a9..2b7d9e78a569 100644 --- a/trunk/mm/mmap.c +++ b/trunk/mm/mmap.c @@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) anon_vma_interval_tree_pre_update_vma(next); @@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - down_write(&anon_vma->root->rwsem); + mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->root->rwsem. If some other vma in this mm shares + * anon_vma->root->mutex. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->rwsem. + * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) @@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->rwsem. + * anon_vma->root->mutex. 
*/ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) diff --git a/trunk/mm/mprotect.c b/trunk/mm/mprotect.c index 3dca970367db..e8c3938db6fa 100644 --- a/trunk/mm/mprotect.c +++ b/trunk/mm/mprotect.c @@ -35,16 +35,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_node) + int dirty_accountable) { - struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; - bool all_same_node = true; - int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -52,43 +48,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - if (!prot_numa) { - ptent = pte_modify(ptent, newprot); - updated = true; - } else { - struct page *page; - - page = vm_normal_page(vma, addr, oldpte); - if (page) { - int this_nid = page_to_nid(page); - if (last_nid == -1) - last_nid = this_nid; - if (last_nid != this_nid) - all_same_node = false; - - /* only check non-shared pages */ - if (!pte_numa(oldpte) && - page_mapcount(page) == 1) { - ptent = pte_mknuma(ptent); - updated = true; - } - } - } + ptent = pte_modify(ptent, newprot); /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) { + if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) - pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -102,40 +72,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } - pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - - *ret_all_same_node = all_same_node; - return pages; } -#ifdef CONFIG_NUMA_BALANCING -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); -} -#else -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; - bool all_same_node; pmd = pmd_offset(pud, addr); do { @@ -143,59 +91,42 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { - pages += HPAGE_PMD_NR; + else if (change_huge_pmd(vma, pmd, addr, newprot)) continue; - } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - pages += 
change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_node); - - /* - * If we are changing protections for NUMA hinting faults then - * set pmd_numa if the examined pages were all on the same - * node. This allows a regular PMD to be handled as one fault - * and effectively batches the taking of the PTL - */ - if (prot_numa && all_same_node) - change_pmd_protnuma(vma->vm_mm, addr, pmd); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); - - return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable) { pud_t *pud; unsigned long next; - unsigned long pages = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable, prot_numa); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); - - return pages; } -static unsigned long change_protection_range(struct vm_area_struct *vma, +static void change_protection(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; unsigned long start = addr; - unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -204,32 +135,10 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable, prot_numa); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); - - /* Only flush the TLB if we actually modified any entries: */ - if (pages) - flush_tlb_range(vma, start, end); - - return pages; -} - -unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long pages; - - mmu_notifier_invalidate_range_start(mm, start, end); - if (is_vm_hugetlb_page(vma)) - pages = hugetlb_change_protection(vma, start, end, newprot); - else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); - mmu_notifier_invalidate_range_end(mm, start, end); - - return pages; + flush_tlb_range(vma, start, end); } int @@ -304,8 +213,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); - + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); + else + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/trunk/mm/mremap.c b/trunk/mm/mremap.c index e1031e1f6a61..eabb24da6c9e 100644 --- a/trunk/mm/mremap.c +++ b/trunk/mm/mremap.c @@ -104,7 +104,7 @@ 
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } if (vma->anon_vma) { anon_vma = vma->anon_vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); } } diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index d037c8bc1512..83637dfba110 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -611,7 +611,6 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - reset_page_last_nid(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3884,7 +3883,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); - reset_page_last_nid(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -4528,11 +4526,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); -#ifdef CONFIG_NUMA_BALANCING - spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; -#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_cgroup_init(pgdat); @@ -5807,8 +5800,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, false, MIGRATE_SYNC, - MR_CMA); + 0, false, MIGRATE_SYNC); } putback_movable_pages(&cc->migratepages); diff --git a/trunk/mm/pgtable-generic.c b/trunk/mm/pgtable-generic.c index 0c8323fe6c8f..e642627da6b7 100644 --- a/trunk/mm/pgtable-generic.c +++ b/trunk/mm/pgtable-generic.c @@ -12,8 +12,8 @@ #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* - * Only sets the access flags (dirty, accessed), as well as write - * permission. Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. 
This @@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); - flush_tlb_fix_spurious_fault(vma, address); + flush_tlb_page(vma, address); } return changed; } @@ -88,8 +88,7 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, { pte_t pte; pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - if (pte_accessible(pte)) - flush_tlb_page(vma, address); + flush_tlb_page(vma, address); return pte; } #endif diff --git a/trunk/mm/rmap.c b/trunk/mm/rmap.c index 2c78f8cadc95..face808a489e 100644 --- a/trunk/mm/rmap.c +++ b/trunk/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex - * anon_vma->rwsem + * anon_vma->mutex * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@ * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock */ @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma_read() such that + * Synchronize against page_lock_anon_vma() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * down_read_trylock() from page_lock_anon_vma_read(). This orders: + * mutex_trylock() from page_lock_anon_vma(). This orders: * - * page_lock_anon_vma_read() VS put_anon_vma() - * down_read_trylock() atomic_dec_and_test() + * page_lock_anon_vma() VS put_anon_vma() + * mutex_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() rwsem_is_locked() + * atomic_read() mutex_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ - if (rwsem_is_locked(&anon_vma->root->rwsem)) { - anon_vma_lock_write(anon_vma); + if (mutex_is_locked(&anon_vma->root->mutex)) { + anon_vma_lock(anon_vma); anon_vma_unlock(anon_vma); } @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma_read() + * optimistically looked up an anon_vma in page_lock_anon_vma() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). 
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) - up_write(&root->rwsem); + mutex_unlock(&root->mutex); root = new_root; - down_write(&root->rwsem); + mutex_lock(&root->mutex); } return root; } @@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) - up_write(&root->rwsem); + mutex_unlock(&root->mutex); } /* @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma_unlock(anon_vma); @@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() - * needing to write-acquire the anon_vma->root->rwsem. + * needing to acquire the anon_vma->root->mutex. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; @@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - init_rwsem(&anon_vma->rwsem); + mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT; } @@ -442,7 +442,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. */ -struct anon_vma *page_lock_anon_vma_read(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (down_read_trylock(&root_anon_vma->rwsem)) { + if (mutex_trylock(&root_anon_vma->mutex)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - up_read(&root_anon_vma->rwsem); + mutex_unlock(&root_anon_vma->mutex); anon_vma = NULL; } goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock_read(anon_vma); + anon_vma_lock(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock_write() recursion. + * we'll deadlock on the anon_vma_lock() recursion. 
*/ - anon_vma_unlock_read(anon_vma); + anon_vma_unlock(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -504,9 +504,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) return anon_vma; } -void page_unlock_anon_vma_read(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { - anon_vma_unlock_read(anon_vma); + anon_vma_unlock(anon_vma); } /* @@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, struct anon_vma_chain *avc; int referenced = 0; - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return referenced; @@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, break; } - page_unlock_anon_vma_read(anon_vma); + page_unlock_anon_vma(anon_vma); return referenced; } @@ -1315,7 +1315,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. + * we now hold anon_vma->mutex or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; @@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) break; } - page_unlock_anon_vma_read(anon_vma); + page_unlock_anon_vma(anon_vma); return ret; } @@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, int ret = SWAP_AGAIN; /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. 
Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing @@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - anon_vma_lock_read(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - anon_vma_unlock_read(anon_vma); + anon_vma_unlock(anon_vma); return ret; } diff --git a/trunk/mm/vmstat.c b/trunk/mm/vmstat.c index 9800306c8195..df14808f0a36 100644 --- a/trunk/mm/vmstat.c +++ b/trunk/mm/vmstat.c @@ -774,20 +774,10 @@ const char * const vmstat_text[] = { "pgrotated", -#ifdef CONFIG_NUMA_BALANCING - "numa_pte_updates", - "numa_hint_faults", - "numa_hint_faults_local", - "numa_pages_migrated", -#endif -#ifdef CONFIG_MIGRATION - "pgmigrate_success", - "pgmigrate_fail", -#endif #ifdef CONFIG_COMPACTION - "compact_migrate_scanned", - "compact_free_scanned", - "compact_isolated", + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", "compact_stall", "compact_fail", "compact_success", diff --git a/trunk/net/dns_resolver/dns_key.c b/trunk/net/dns_resolver/dns_key.c index 0a69d0757795..8aa4b1115384 100644 --- a/trunk/net/dns_resolver/dns_key.c +++ b/trunk/net/dns_resolver/dns_key.c @@ -259,16 +259,20 @@ static int __init init_dns_resolver(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".dns_resolver", - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + keyring = key_alloc(&key_type_keyring, ".dns_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + ret = register_key_type(&key_type_dns_resolver); if (ret < 0) goto failed_put_key; @@ -300,4 +304,3 @@ static void __exit exit_dns_resolver(void) module_init(init_dns_resolver) module_exit(exit_dns_resolver) MODULE_LICENSE("GPL"); - diff --git a/trunk/security/keys/key.c b/trunk/security/keys/key.c index 8fb7c7bd4657..a15c9da8f971 100644 --- a/trunk/security/keys/key.c +++ b/trunk/security/keys/key.c @@ -854,13 +854,13 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW; + perm |= KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_LINK | KEY_USR_SETATTR; if (ktype->read) - perm |= KEY_POS_READ; + perm |= KEY_POS_READ | KEY_USR_READ; if (ktype == &key_type_keyring || ktype->update) - perm |= KEY_POS_WRITE; + perm |= KEY_USR_WRITE; } /* allocate a new key */ diff --git a/trunk/security/keys/keyctl.c b/trunk/security/keys/keyctl.c index 4b5c948eb414..5d34b4e827d6 100644 --- a/trunk/security/keys/keyctl.c +++ b/trunk/security/keys/keyctl.c @@ -1132,12 +1132,12 @@ long keyctl_instantiate_key_iov(key_serial_t id, ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, ARRAY_SIZE(iovstack), iovstack, &iov); if (ret < 0) - goto err; + return ret; if (ret == 0) goto 
no_payload_free; ret = keyctl_instantiate_key_common(id, iov, ioc, ret, ringid); -err: + if (iov != iovstack) kfree(iov); return ret; @@ -1495,8 +1495,7 @@ long keyctl_session_to_parent(void) goto error_keyring; newwork = &cred->rcu; - cred->session_keyring = key_ref_to_ptr(keyring_r); - keyring_r = NULL; + cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); init_task_work(newwork, key_change_session_keyring); me = current; @@ -1520,7 +1519,7 @@ long keyctl_session_to_parent(void) mycred = current_cred(); pcred = __task_cred(parent); if (mycred == pcred || - mycred->session_keyring == pcred->session_keyring) { + mycred->tgcred->session_keyring == pcred->tgcred->session_keyring) { ret = 0; goto unlock; } @@ -1536,9 +1535,9 @@ long keyctl_session_to_parent(void) goto unlock; /* the keyrings must have the same UID */ - if ((pcred->session_keyring && - !uid_eq(pcred->session_keyring->uid, mycred->euid)) || - !uid_eq(mycred->session_keyring->uid, mycred->euid)) + if ((pcred->tgcred->session_keyring && + !uid_eq(pcred->tgcred->session_keyring->uid, mycred->euid)) || + !uid_eq(mycred->tgcred->session_keyring->uid, mycred->euid)) goto unlock; /* cancel an already pending keyring replacement */ diff --git a/trunk/security/keys/keyring.c b/trunk/security/keys/keyring.c index 6ece7f2e5707..6e42df15a24c 100644 --- a/trunk/security/keys/keyring.c +++ b/trunk/security/keys/keyring.c @@ -257,14 +257,17 @@ static long keyring_read(const struct key *keyring, * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, - const struct cred *cred, key_perm_t perm, - unsigned long flags, struct key *dest) + const struct cred *cred, unsigned long flags, + struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, cred, perm, flags); + uid, gid, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, + flags); + if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { @@ -275,7 +278,6 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, return keyring; } -EXPORT_SYMBOL(keyring_alloc); /** * keyring_search_aux - Search a keyring tree for a key matching some criteria diff --git a/trunk/security/keys/process_keys.c b/trunk/security/keys/process_keys.c index 58dfe0890947..86468f385fc8 100644 --- a/trunk/security/keys/process_keys.c +++ b/trunk/security/keys/process_keys.c @@ -45,12 +45,10 @@ int install_user_keyrings(void) struct user_struct *user; const struct cred *cred; struct key *uid_keyring, *session_keyring; - key_perm_t user_keyring_perm; char buf[20]; int ret; uid_t uid; - user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; cred = current_cred(); user = cred->user; uid = from_kuid(cred->user_ns, user->uid); @@ -75,8 +73,8 @@ int install_user_keyrings(void) uid_keyring = find_keyring_by_name(buf, true); if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, - cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + cred, KEY_ALLOC_IN_QUOTA, + NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; @@ -91,8 +89,7 @@ int install_user_keyrings(void) if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, - cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + cred, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -133,7 +130,6 @@ int 
install_thread_keyring_to_cred(struct cred *new) struct key *keyring; keyring = keyring_alloc("_tid", new->uid, new->gid, new, - KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -174,18 +170,27 @@ static int install_thread_keyring(void) int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; + int ret; - if (new->process_keyring) + if (new->tgcred->process_keyring) return -EEXIST; - keyring = keyring_alloc("_pid", new->uid, new->gid, new, - KEY_POS_ALL | KEY_USR_VIEW, - KEY_ALLOC_QUOTA_OVERRUN, NULL); + keyring = keyring_alloc("_pid", new->uid, new->gid, + new, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); - new->process_keyring = keyring; - return 0; + spin_lock_irq(&new->tgcred->lock); + if (!new->tgcred->process_keyring) { + new->tgcred->process_keyring = keyring; + keyring = NULL; + ret = 0; + } else { + ret = -EEXIST; + } + spin_unlock_irq(&new->tgcred->lock); + key_put(keyring); + return ret; } /* @@ -226,12 +231,11 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; - if (cred->session_keyring) + if (cred->tgcred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, - flags, NULL); + keyring = keyring_alloc("_ses", cred->uid, cred->gid, + cred, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { @@ -239,11 +243,17 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) } /* install the keyring */ - old = cred->session_keyring; - rcu_assign_pointer(cred->session_keyring, keyring); - - if (old) + spin_lock_irq(&cred->tgcred->lock); + old = cred->tgcred->session_keyring; + rcu_assign_pointer(cred->tgcred->session_keyring, keyring); + spin_unlock_irq(&cred->tgcred->lock); + + /* we're using RCU on the pointer, but there's no point synchronising + * on it if it didn't previously point to anything */ + if (old) { + synchronize_rcu(); key_put(old); + } return 0; } @@ -358,9 +368,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the process keyring second */ - if (cred->process_keyring) { + if (cred->tgcred->process_keyring) { key_ref = keyring_search_aux( - make_key_ref(cred->process_keyring, 1), + make_key_ref(cred->tgcred->process_keyring, 1), cred, type, description, match, no_state_check); if (!IS_ERR(key_ref)) goto found; @@ -379,10 +389,12 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the session keyring */ - if (cred->session_keyring) { + if (cred->tgcred->session_keyring) { rcu_read_lock(); key_ref = keyring_search_aux( - make_key_ref(rcu_dereference(cred->session_keyring), 1), + make_key_ref(rcu_dereference( + cred->tgcred->session_keyring), + 1), cred, type, description, match, no_state_check); rcu_read_unlock(); @@ -552,7 +564,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, break; case KEY_SPEC_PROCESS_KEYRING: - if (!cred->process_keyring) { + if (!cred->tgcred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; @@ -564,13 +576,13 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, goto reget_creds; } - key = cred->process_keyring; + key = cred->tgcred->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!cred->session_keyring) { + if 
(!cred->tgcred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = install_user_keyrings(); @@ -585,7 +597,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, if (ret < 0) goto error; goto reget_creds; - } else if (cred->session_keyring == + } else if (cred->tgcred->session_keyring == cred->user->session_keyring && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); @@ -595,7 +607,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, } rcu_read_lock(); - key = rcu_dereference(cred->session_keyring); + key = rcu_dereference(cred->tgcred->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); @@ -755,6 +767,12 @@ long join_session_keyring(const char *name) struct key *keyring; long ret, serial; + /* only permit this if there's a single thread in the thread group - + * this avoids us having to adjust the creds on all threads and risking + * ENOMEM */ + if (!current_is_single_threaded()) + return -EMLINK; + new = prepare_creds(); if (!new) return -ENOMEM; @@ -766,7 +784,7 @@ long join_session_keyring(const char *name) if (ret < 0) goto error; - serial = new->session_keyring->serial; + serial = new->tgcred->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; @@ -780,10 +798,8 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc( - name, old->uid, old->gid, old, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, - KEY_ALLOC_IN_QUOTA, NULL); + keyring = keyring_alloc(name, old->uid, old->gid, old, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; @@ -791,9 +807,6 @@ long join_session_keyring(const char *name) } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; - } else if (keyring == new->session_keyring) { - ret = 0; - goto error2; } /* we've got a keyring - now to install it */ @@ -850,7 +863,8 @@ void key_change_session_keyring(struct callback_head *twork) new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); - new->process_keyring = key_get(old->process_keyring); + new->tgcred->tgid = old->tgcred->tgid; + new->tgcred->process_keyring = key_get(old->tgcred->process_keyring); security_transfer_creds(new, old); diff --git a/trunk/security/keys/request_key.c b/trunk/security/keys/request_key.c index 4bd6bdb74193..66e21184b559 100644 --- a/trunk/security/keys/request_key.c +++ b/trunk/security/keys/request_key.c @@ -126,7 +126,6 @@ static int call_sbin_request_key(struct key_construction *cons, cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL); put_cred(cred); if (IS_ERR(keyring)) { @@ -151,12 +150,12 @@ static int call_sbin_request_key(struct key_construction *cons, cred->thread_keyring ? 
cred->thread_keyring->serial : 0); prkey = 0; - if (cred->process_keyring) - prkey = cred->process_keyring->serial; + if (cred->tgcred->process_keyring) + prkey = cred->tgcred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); rcu_read_lock(); - session = rcu_dereference(cred->session_keyring); + session = rcu_dereference(cred->tgcred->session_keyring); if (!session) session = cred->user->session_keyring; sskey = session->serial; @@ -298,14 +297,14 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = key_get(cred->process_keyring); + dest_keyring = key_get(cred->tgcred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( - rcu_dereference(cred->session_keyring)); + rcu_dereference(cred->tgcred->session_keyring)); rcu_read_unlock(); if (dest_keyring) @@ -348,7 +347,6 @@ static int construct_alloc_key(struct key_type *type, const struct cred *cred = current_cred(); unsigned long prealloc; struct key *key; - key_perm_t perm; key_ref_t key_ref; int ret; @@ -357,15 +355,8 @@ static int construct_alloc_key(struct key_type *type, *_key = NULL; mutex_lock(&user->cons_lock); - perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW; - if (type->read) - perm |= KEY_POS_READ; - if (type == &key_type_keyring || type->update) - perm |= KEY_POS_WRITE; - key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred, - perm, flags); + KEY_POS_ALL, flags); if (IS_ERR(key)) goto alloc_failed; diff --git a/trunk/security/smack/Kconfig b/trunk/security/smack/Kconfig index e69de9c642b7..603b08784341 100644 --- a/trunk/security/smack/Kconfig +++ b/trunk/security/smack/Kconfig @@ -1,10 +1,6 @@ config SECURITY_SMACK bool "Simplified Mandatory Access Control Kernel Support" - depends on NET - depends on INET - depends on SECURITY - select NETLABEL - select SECURITY_NETWORK + depends on NETLABEL && SECURITY_NETWORK default n help This selects the Simplified Mandatory Access Control Kernel. 
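For reference, the permission-defaulting logic this patch removes from construct_alloc_key() (and the matching defaults in key_create_or_update()) follows a simple capability-dependent mask pattern: start from a fixed possessor set and widen it only for operations the key type actually supports. The following is a minimal standalone sketch of that pattern only; the SK_* bit values and the sk_key_type layout are invented stand-ins for this illustration, not the kernel's KEY_POS_*/KEY_USR_* constants or struct key_type.

/*
 * Minimal sketch of capability-dependent default permission masks,
 * loosely modelled on the logic removed above.  All names and bit
 * values here are assumptions for illustration, not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

#define SK_POS_VIEW    0x01u
#define SK_POS_READ    0x02u
#define SK_POS_WRITE   0x04u
#define SK_POS_SEARCH  0x08u
#define SK_POS_LINK    0x10u
#define SK_POS_SETATTR 0x20u
#define SK_USR_VIEW    0x0100u

struct sk_key_type {
	const char *name;
	int (*read)(void);    /* non-NULL if keys of this type can be read */
	int (*update)(void);  /* non-NULL if keys of this type can be updated */
};

/* Start from a fixed possessor set, widen only for supported operations. */
static unsigned int sk_default_perm(const struct sk_key_type *type)
{
	unsigned int perm = SK_POS_VIEW | SK_POS_SEARCH | SK_POS_LINK | SK_POS_SETATTR;

	perm |= SK_USR_VIEW;
	if (type->read)
		perm |= SK_POS_READ;
	if (type->update)
		perm |= SK_POS_WRITE;
	return perm;
}

static int sk_dummy_read(void) { return 0; }

int main(void)
{
	struct sk_key_type readable = { "readable", sk_dummy_read, NULL };
	struct sk_key_type opaque   = { "opaque",   NULL,          NULL };

	printf("%-8s -> 0x%04x\n", readable.name, sk_default_perm(&readable));
	printf("%-8s -> 0x%04x\n", opaque.name,   sk_default_perm(&opaque));
	return 0;
}

The design point being reverted is exactly this conditional widening: the older code replaced by it simply granted a broad fixed mask regardless of the key type's capabilities.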
diff --git a/trunk/security/smack/smackfs.c b/trunk/security/smack/smackfs.c index 76a5dca46404..99929a50093a 100644 --- a/trunk/security/smack/smackfs.c +++ b/trunk/security/smack/smackfs.c @@ -2063,19 +2063,6 @@ static const struct file_operations smk_revoke_subj_ops = { .llseek = generic_file_llseek, }; -static struct kset *smackfs_kset; -/** - * smk_init_sysfs - initialize /sys/fs/smackfs - * - */ -static int smk_init_sysfs(void) -{ - smackfs_kset = kset_create_and_add("smackfs", NULL, fs_kobj); - if (!smackfs_kset) - return -ENOMEM; - return 0; -} - /** * smk_fill_super - fill the /smackfs superblock * @sb: the empty superblock @@ -2196,10 +2183,6 @@ static int __init init_smk_fs(void) if (!security_module_enable(&smack_ops)) return 0; - err = smk_init_sysfs(); - if (err) - printk(KERN_ERR "smackfs: sysfs mountpoint problem.\n"); - err = register_filesystem(&smk_fs_type); if (!err) { smackfs_mount = kern_mount(&smk_fs_type); diff --git a/trunk/security/yama/yama_lsm.c b/trunk/security/yama/yama_lsm.c index 2663145d1197..b4c29848b49d 100644 --- a/trunk/security/yama/yama_lsm.c +++ b/trunk/security/yama/yama_lsm.c @@ -17,7 +17,6 @@ #include #include #include -#include #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 @@ -30,37 +29,12 @@ static int ptrace_scope = YAMA_SCOPE_RELATIONAL; struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; - bool invalid; struct list_head node; - struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); -static void yama_relation_cleanup(struct work_struct *work); -static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); - -/** - * yama_relation_cleanup - remove invalid entries from the relation list - * - */ -static void yama_relation_cleanup(struct work_struct *work) -{ - struct ptrace_relation *relation; - - spin_lock(&ptracer_relations_lock); - rcu_read_lock(); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) { - list_del_rcu(&relation->node); - kfree_rcu(relation, rcu); - } - } - rcu_read_unlock(); - spin_unlock(&ptracer_relations_lock); -} - /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace @@ -74,34 +48,32 @@ static void yama_relation_cleanup(struct work_struct *work) static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { - struct ptrace_relation *relation, *added; + int rc = 0; + struct ptrace_relation *added; + struct ptrace_relation *entry, *relation = NULL; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; - added->tracee = tracee; - added->tracer = tracer; - added->invalid = false; - - spin_lock(&ptracer_relations_lock); - rcu_read_lock(); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) - continue; - if (relation->tracee == tracee) { - list_replace_rcu(&relation->node, &added->node); - kfree_rcu(relation, rcu); - goto out; + spin_lock_bh(&ptracer_relations_lock); + list_for_each_entry(entry, &ptracer_relations, node) + if (entry->tracee == tracee) { + relation = entry; + break; } + if (!relation) { + relation = added; + relation->tracee = tracee; + list_add(&relation->node, &ptracer_relations); } + relation->tracer = tracer; - list_add_rcu(&added->node, &ptracer_relations); + spin_unlock_bh(&ptracer_relations_lock); + if (added != relation) + kfree(added); -out: - rcu_read_unlock(); - spin_unlock(&ptracer_relations_lock); - return 
0; + return rc; } /** @@ -112,23 +84,16 @@ static int yama_ptracer_add(struct task_struct *tracer, static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { - struct ptrace_relation *relation; - bool marked = false; + struct ptrace_relation *relation, *safe; - rcu_read_lock(); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) - continue; + spin_lock_bh(&ptracer_relations_lock); + list_for_each_entry_safe(relation, safe, &ptracer_relations, node) if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { - relation->invalid = true; - marked = true; + list_del(&relation->node); + kfree(relation); } - } - rcu_read_unlock(); - - if (marked) - schedule_work(&yama_relation_work); + spin_unlock_bh(&ptracer_relations_lock); } /** @@ -252,22 +217,21 @@ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *parent = NULL; bool found = false; + spin_lock_bh(&ptracer_relations_lock); rcu_read_lock(); if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) - continue; + list_for_each_entry(relation, &ptracer_relations, node) if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } - } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; rcu_read_unlock(); + spin_unlock_bh(&ptracer_relations_lock); return rc; }
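The final hunks restore Yama's ptracer exception list to a plain lock-protected linked list: one entry per tracee, replaced in place on re-registration, removed when either endpoint goes away, and scanned on access checks, instead of the RCU list with invalid-marking and deferred cleanup that the patch removes. The sketch below reproduces only that add-or-replace / delete / lookup pattern as standalone userspace C with a pthread mutex; the relation type, function names, and use of plain integer pids are assumptions made for the illustration, not the kernel's code.

/*
 * Standalone sketch of a lock-protected exception list with the same
 * add-or-replace, delete-by-endpoint, and lookup operations as the
 * restored yama_ptracer_* helpers.  Names and types are illustrative.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct relation {
	int tracer;              /* pid allowed to attach */
	int tracee;              /* pid being traced */
	struct relation *next;
};

static struct relation *relations;
static pthread_mutex_t relations_lock = PTHREAD_MUTEX_INITIALIZER;

/* Add an exception, replacing any existing entry for the same tracee. */
static int relation_add(int tracer, int tracee)
{
	struct relation *r;

	pthread_mutex_lock(&relations_lock);
	for (r = relations; r; r = r->next) {
		if (r->tracee == tracee) {
			r->tracer = tracer;      /* replace in place */
			pthread_mutex_unlock(&relations_lock);
			return 0;
		}
	}
	r = malloc(sizeof(*r));
	if (!r) {
		pthread_mutex_unlock(&relations_lock);
		return -1;
	}
	r->tracer = tracer;
	r->tracee = tracee;
	r->next = relations;
	relations = r;
	pthread_mutex_unlock(&relations_lock);
	return 0;
}

/* Drop every exception involving the given pid, as either endpoint. */
static void relation_del(int pid)
{
	struct relation **pp = &relations, *r;

	pthread_mutex_lock(&relations_lock);
	while ((r = *pp) != NULL) {
		if (r->tracee == pid || r->tracer == pid) {
			*pp = r->next;
			free(r);
		} else {
			pp = &r->next;
		}
	}
	pthread_mutex_unlock(&relations_lock);
}

/* Return the registered tracer for a tracee, or -1 if none. */
static int relation_find(int tracee)
{
	struct relation *r;
	int tracer = -1;

	pthread_mutex_lock(&relations_lock);
	for (r = relations; r; r = r->next) {
		if (r->tracee == tracee) {
			tracer = r->tracer;
			break;
		}
	}
	pthread_mutex_unlock(&relations_lock);
	return tracer;
}

int main(void)
{
	relation_add(100, 200);          /* pid 100 may trace pid 200 */
	relation_add(101, 200);          /* replaced: now pid 101 */
	printf("tracer for 200: %d\n", relation_find(200));
	relation_del(200);
	printf("tracer for 200 after del: %d\n", relation_find(200));
	return 0;
}

The trade-off is the one visible in the diff itself: the plain list takes the spinlock on every lookup (including ptracer_exception_found), whereas the removed RCU variant kept lookups lock-free at the cost of deferred cleanup work.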