From f75dbe81131e32cf11795b38f7a225a0c09d2b79 Mon Sep 17 00:00:00 2001
From: James Morris
Date: Wed, 18 Jan 2012 10:40:44 +1100
Subject: [PATCH]

--- yaml ---
r: 286075
b: refs/heads/master
c: 89879a7eb81f69e6f63bdb2a442fb765c46482c0
h: refs/heads/master
i:
  286073: 0c7d71eff6d9710689b0283b8dcbeb1174b0a46a
  286071: 61932c9bdb80a5c6a5b7575f687a7e40650d53c5
v: v3
---
 [refs] | 2 +-
 trunk/MAINTAINERS | 2 +-
 trunk/arch/arm/include/asm/kprobes.h | 1 +
 trunk/arch/arm/include/asm/ptrace.h | 5 -
 trunk/arch/arm/include/asm/thread_info.h | 6 -
 trunk/arch/arm/kernel/entry-common.S | 4 +-
 trunk/arch/arm/kernel/ptrace.c | 16 +-
 trunk/arch/ia64/include/asm/ptrace.h | 13 +-
 trunk/arch/ia64/kernel/ptrace.c | 18 +-
 trunk/arch/microblaze/include/asm/ptrace.h | 5 -
 trunk/arch/microblaze/kernel/ptrace.c | 9 +-
 trunk/arch/microblaze/kernel/setup.c | 21 +-
 trunk/arch/mips/include/asm/ptrace.h | 14 +-
 trunk/arch/mips/kernel/ptrace.c | 11 +-
 trunk/arch/powerpc/include/asm/ptrace.h | 13 +-
 trunk/arch/powerpc/kernel/ptrace.c | 30 +-
 trunk/arch/s390/include/asm/ptrace.h | 6 +-
 trunk/arch/s390/kernel/ptrace.c | 15 +-
 trunk/arch/sh/include/asm/ptrace_32.h | 5 +-
 trunk/arch/sh/include/asm/ptrace_64.h | 5 +-
 trunk/arch/sh/kernel/ptrace_32.c | 11 +-
 trunk/arch/sh/kernel/ptrace_64.c | 11 +-
 trunk/arch/sparc/include/asm/ptrace.h | 10 +-
 trunk/arch/sparc/kernel/ptrace_64.c | 28 +-
 trunk/arch/um/kernel/ptrace.c | 20 +-
 trunk/arch/x86/ia32/ia32entry.S | 14 +-
 trunk/arch/x86/kernel/entry_32.S | 10 +-
 trunk/arch/x86/kernel/entry_64.S | 14 +-
 trunk/arch/x86/kernel/ptrace.c | 25 +-
 trunk/arch/x86/kernel/vm86_32.c | 4 +-
 trunk/arch/x86/um/shared/sysdep/ptrace.h | 5 -
 trunk/arch/xtensa/kernel/ptrace.c | 3 +-
 trunk/block/cfq-iosched.c | 7 +-
 trunk/drivers/usb/host/ehci-xilinx-of.c | 2 +-
 trunk/drivers/xen/xen-balloon.c | 2 +-
 trunk/fs/btrfs/Kconfig | 19 -
 trunk/fs/btrfs/Makefile | 3 +-
 trunk/fs/btrfs/backref.c | 1131 ++------
 trunk/fs/btrfs/backref.h | 5 -
 trunk/fs/btrfs/btrfs_inode.h | 3 -
 trunk/fs/btrfs/check-integrity.c | 3068 --------------------
 trunk/fs/btrfs/check-integrity.h | 36 -
 trunk/fs/btrfs/ctree.c | 42 +-
 trunk/fs/btrfs/ctree.h | 239 +-
 trunk/fs/btrfs/delayed-inode.c | 45 +-
 trunk/fs/btrfs/delayed-ref.c | 153 +-
 trunk/fs/btrfs/delayed-ref.h | 104 +-
 trunk/fs/btrfs/disk-io.c | 119 +-
 trunk/fs/btrfs/disk-io.h | 6 +-
 trunk/fs/btrfs/export.c | 2 +-
 trunk/fs/btrfs/extent-tree.c | 465 +--
 trunk/fs/btrfs/extent_io.c | 6 +-
 trunk/fs/btrfs/extent_io.h | 2 -
 trunk/fs/btrfs/file.c | 11 +-
 trunk/fs/btrfs/free-space-cache.c | 417 +--
 trunk/fs/btrfs/inode-map.c | 4 -
 trunk/fs/btrfs/inode.c | 66 +-
 trunk/fs/btrfs/ioctl.c | 268 +-
 trunk/fs/btrfs/ioctl.h | 54 -
 trunk/fs/btrfs/locking.c | 53 +-
 trunk/fs/btrfs/relocation.c | 20 +-
 trunk/fs/btrfs/scrub.c | 12 +-
 trunk/fs/btrfs/super.c | 190 +-
 trunk/fs/btrfs/transaction.c | 20 +-
 trunk/fs/btrfs/tree-log.c | 2 +-
 trunk/fs/btrfs/ulist.c | 220 --
 trunk/fs/btrfs/ulist.h | 68 -
 trunk/fs/btrfs/volumes.c | 993 ++-----
 trunk/fs/btrfs/volumes.h | 54 +-
 trunk/fs/btrfs/xattr.c | 2 +-
 trunk/fs/namei.c | 28 +-
 trunk/fs/proc/base.c | 150 +-
 trunk/fs/xfs/xfs_aops.c | 29 +-
 trunk/fs/xfs/xfs_attr.c | 4 +
 trunk/fs/xfs/xfs_attr_leaf.c | 9 +
 trunk/fs/xfs/xfs_bmap.c | 116 +-
 trunk/fs/xfs/xfs_dfrag.c | 43 +-
 trunk/fs/xfs/xfs_file.c | 184 +-
 trunk/fs/xfs/xfs_fs_subr.c | 2 +-
 trunk/fs/xfs/xfs_iget.c | 24 +-
 trunk/fs/xfs/xfs_inode.c | 193 +-
 trunk/fs/xfs/xfs_inode.h | 114 +-
 trunk/fs/xfs/xfs_inode_item.c | 8 +-
 trunk/fs/xfs/xfs_iomap.c | 46 +-
 trunk/fs/xfs/xfs_iops.c | 46 +-
 trunk/fs/xfs/xfs_qm_syscalls.c | 8 +-
trunk/fs/xfs/xfs_super.c | 8 + trunk/fs/xfs/xfs_sync.c | 9 +- trunk/fs/xfs/xfs_trace.h | 29 +- trunk/fs/xfs/xfs_vnodeops.c | 44 +- trunk/include/linux/audit.h | 116 +- trunk/include/linux/kref.h | 1 - trunk/include/linux/ptrace.h | 10 - trunk/include/linux/tty_driver.h | 1 + trunk/include/trace/events/btrfs.h | 203 -- trunk/init/Kconfig | 16 +- trunk/kernel/audit.c | 4 +- trunk/kernel/audit.h | 6 +- trunk/kernel/auditfilter.c | 17 +- trunk/kernel/auditsc.c | 735 ++--- trunk/kernel/capability.c | 2 +- trunk/kernel/exit.c | 3 +- trunk/kernel/fork.c | 2 + trunk/kernel/seccomp.c | 2 - trunk/security/integrity/ima/ima_audit.c | 8 +- trunk/security/lsm_audit.c | 27 +- trunk/sound/core/Kconfig | 13 +- trunk/sound/pci/au88x0/au88x0.c | 13 +- trunk/sound/pci/au88x0/au88x0.h | 1 - trunk/sound/pci/au88x0/au88x0_pcm.c | 1 - trunk/sound/pci/hda/hda_intel.c | 1 - trunk/sound/pci/hda/patch_sigmatel.c | 2 +- trunk/sound/pci/oxygen/xonar_wm87x6.c | 1 - 113 files changed, 2222 insertions(+), 8326 deletions(-) delete mode 100644 trunk/fs/btrfs/check-integrity.c delete mode 100644 trunk/fs/btrfs/check-integrity.h delete mode 100644 trunk/fs/btrfs/ulist.c delete mode 100644 trunk/fs/btrfs/ulist.h diff --git a/[refs] b/[refs] index 70eaeb510854..7f96309ad7cf 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: f429ee3b808118591d1f3cdf3c0d0793911a5677 +refs/heads/master: 89879a7eb81f69e6f63bdb2a442fb765c46482c0 diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 2a90101309d1..ece8935025e3 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -5846,7 +5846,7 @@ F: drivers/mmc/host/sdhci-spear.c SECURITY SUBSYSTEM M: James Morris L: linux-security-module@vger.kernel.org (suggested Cc:) -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git W: http://security.wiki.kernel.org/ S: Supported F: security/ diff --git a/trunk/arch/arm/include/asm/kprobes.h b/trunk/arch/arm/include/asm/kprobes.h index f82ec22eeb11..feec86768f9c 100644 --- a/trunk/arch/arm/include/asm/kprobes.h +++ b/trunk/arch/arm/include/asm/kprobes.h @@ -24,6 +24,7 @@ #define MAX_INSN_SIZE 2 #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ +#define regs_return_value(regs) ((regs)->ARM_r0) #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 diff --git a/trunk/arch/arm/include/asm/ptrace.h b/trunk/arch/arm/include/asm/ptrace.h index 451808ba1211..96187ff58c24 100644 --- a/trunk/arch/arm/include/asm/ptrace.h +++ b/trunk/arch/arm/include/asm/ptrace.h @@ -189,11 +189,6 @@ static inline int valid_user_regs(struct pt_regs *regs) return 0; } -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->ARM_r0; -} - #define instruction_pointer(regs) (regs)->ARM_pc #ifdef CONFIG_SMP diff --git a/trunk/arch/arm/include/asm/thread_info.h b/trunk/arch/arm/include/asm/thread_info.h index d4c24d412a8d..0f30c3a78fc1 100644 --- a/trunk/arch/arm/include/asm/thread_info.h +++ b/trunk/arch/arm/include/asm/thread_info.h @@ -129,7 +129,6 @@ extern void vfp_flush_hwstate(struct thread_info *); /* * thread information flags: * TIF_SYSCALL_TRACE - syscall trace active - * TIF_SYSCAL_AUDIT - syscall auditing active * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user @@ -140,7 +139,6 @@ extern void vfp_flush_hwstate(struct thread_info *); #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before 
returning to user */ #define TIF_SYSCALL_TRACE 8 -#define TIF_SYSCALL_AUDIT 9 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -151,15 +149,11 @@ extern void vfp_flush_hwstate(struct thread_info *); #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -/* Checks for any syscall work in entry-common.S */ -#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT) - /* * Change these and you break ASM code in entry-common.S */ diff --git a/trunk/arch/arm/kernel/entry-common.S b/trunk/arch/arm/kernel/entry-common.S index 520889cf1b5b..b2a27b6b0046 100644 --- a/trunk/arch/arm/kernel/entry-common.S +++ b/trunk/arch/arm/kernel/entry-common.S @@ -87,7 +87,7 @@ ENTRY(ret_from_fork) get_thread_info tsk ldr r1, [tsk, #TI_FLAGS] @ check for syscall tracing mov why, #1 - tst r1, #_TIF_SYSCALL_WORK @ are we tracing syscalls? + tst r1, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? beq ret_slow_syscall mov r1, sp mov r0, #1 @ trace exit [IP = 1] @@ -443,7 +443,7 @@ ENTRY(vector_swi) 1: #endif - tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls? + tst r10, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? bne __sys_trace cmp scno, #NR_syscalls @ check upper syscall limit diff --git a/trunk/arch/arm/kernel/ptrace.c b/trunk/arch/arm/kernel/ptrace.c index e1d5e1929fbd..483727ad6892 100644 --- a/trunk/arch/arm/kernel/ptrace.c +++ b/trunk/arch/arm/kernel/ptrace.c @@ -906,6 +906,11 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) { unsigned long ip; + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return scno; + if (!(current->ptrace & PT_PTRACED)) + return scno; + /* * Save IP. IP is used to denote syscall entry/exit: * IP = 0 -> entry, = 1 -> exit @@ -913,17 +918,6 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) ip = regs->ARM_ip; regs->ARM_ip = why; - if (!ip) - audit_syscall_exit(regs); - else - audit_syscall_entry(AUDIT_ARCH_ARMEB, scno, regs->ARM_r0, - regs->ARM_r1, regs->ARM_r2, regs->ARM_r3); - - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return scno; - if (!(current->ptrace & PT_PTRACED)) - return scno; - current_thread_info()->syscall = scno; /* the 0x80 provides a way for the tracing parent to distinguish diff --git a/trunk/arch/ia64/include/asm/ptrace.h b/trunk/arch/ia64/include/asm/ptrace.h index 68c98f5b3ca6..f5cb27614e35 100644 --- a/trunk/arch/ia64/include/asm/ptrace.h +++ b/trunk/arch/ia64/include/asm/ptrace.h @@ -246,18 +246,7 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->ar_bspstore; } -static inline int is_syscall_success(struct pt_regs *regs) -{ - return regs->r10 != -1; -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - if (is_syscall_success(regs)) - return regs->r8; - else - return -regs->r8; -} +#define regs_return_value(regs) ((regs)->r8) /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. 
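
[Note: the ia64 hunk above and the per-arch ptrace hunks that follow all revert to the same older audit convention: regs_return_value() goes back to being a plain register accessor, and each architecture's syscall trace hooks test current->audit_context themselves before calling audit_syscall_entry()/audit_syscall_exit(), classifying the result via AUDITSC_RESULT(). A minimal, arch-neutral sketch of that restored convention follows; the example_* names, AUDIT_ARCH_EXAMPLE, and the pt_regs fields (nr, a0..a3, ret) are illustrative only, not taken from any real architecture.]

/*
 * Sketch only, not part of the patch: the calling convention the
 * per-arch hunks revert to. Names and register fields are hypothetical.
 */
static void example_syscall_trace_enter(struct pt_regs *regs)
{
	/* skip all audit work unless this task has an audit context */
	if (unlikely(current->audit_context))
		audit_syscall_entry(AUDIT_ARCH_EXAMPLE, regs->nr,
				    regs->a0, regs->a1, regs->a2, regs->a3);
}

static void example_syscall_trace_leave(struct pt_regs *regs)
{
	/* AUDITSC_RESULT() maps a negative return value to AUDITSC_FAILURE */
	if (unlikely(current->audit_context))
		audit_syscall_exit(AUDITSC_RESULT(regs->ret), regs->ret);
}

[The unlikely() hint keeps the untraced fast path cheap, which is why every architecture in this patch reintroduces the same guard instead of calling into the audit core unconditionally.]
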
diff --git a/trunk/arch/ia64/kernel/ptrace.c b/trunk/arch/ia64/kernel/ptrace.c index dad91661ddf9..8848f43d819e 100644 --- a/trunk/arch/ia64/kernel/ptrace.c +++ b/trunk/arch/ia64/kernel/ptrace.c @@ -1246,8 +1246,15 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); + if (unlikely(current->audit_context)) { + long syscall; + int arch; - audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); + syscall = regs.r15; + arch = AUDIT_ARCH_IA64; + + audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); + } return 0; } @@ -1261,7 +1268,14 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, { int step; - audit_syscall_exit(®s); + if (unlikely(current->audit_context)) { + int success = AUDITSC_RESULT(regs.r10); + long result = regs.r8; + + if (success != AUDITSC_SUCCESS) + result = -result; + audit_syscall_exit(success, result); + } step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/trunk/arch/microblaze/include/asm/ptrace.h b/trunk/arch/microblaze/include/asm/ptrace.h index 94e92c805859..816bee64b196 100644 --- a/trunk/arch/microblaze/include/asm/ptrace.h +++ b/trunk/arch/microblaze/include/asm/ptrace.h @@ -61,11 +61,6 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->r3; -} - #else /* __KERNEL__ */ /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ diff --git a/trunk/arch/microblaze/kernel/ptrace.c b/trunk/arch/microblaze/kernel/ptrace.c index 6eb2aa927d89..043cb58f9c44 100644 --- a/trunk/arch/microblaze/kernel/ptrace.c +++ b/trunk/arch/microblaze/kernel/ptrace.c @@ -147,8 +147,10 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; - audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, - regs->r7, regs->r8); + if (unlikely(current->audit_context)) + audit_syscall_entry(EM_MICROBLAZE, regs->r12, + regs->r5, regs->r6, + regs->r7, regs->r8); return ret ?: regs->r12; } @@ -157,7 +159,8 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/trunk/arch/microblaze/kernel/setup.c b/trunk/arch/microblaze/kernel/setup.c index d4fc1a971779..604cd9dd1333 100644 --- a/trunk/arch/microblaze/kernel/setup.c +++ b/trunk/arch/microblaze/kernel/setup.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -227,23 +226,5 @@ static int __init setup_bus_notifier(void) return 0; } -arch_initcall(setup_bus_notifier); - -static DEFINE_PER_CPU(struct cpu, cpu_devices); - -static int __init topology_init(void) -{ - int i, ret; - - for_each_present_cpu(i) { - struct cpu *c = &per_cpu(cpu_devices, i); - ret = register_cpu(c, i); - if (ret) - printk(KERN_WARNING "topology_init: register_cpu %d " - "failed (%d)\n", i, ret); - } - - return 0; -} -subsys_initcall(topology_init); +arch_initcall(setup_bus_notifier); diff --git a/trunk/arch/mips/include/asm/ptrace.h b/trunk/arch/mips/include/asm/ptrace.h index 4b7f5252d2fd..7b99c670e478 100644 --- a/trunk/arch/mips/include/asm/ptrace.h +++ b/trunk/arch/mips/include/asm/ptrace.h @@ -137,19 +137,7 @@ extern int ptrace_set_watch_regs(struct task_struct 
*child, */ #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) -static inline int is_syscall_success(struct pt_regs *regs) -{ - return !regs->regs[7]; -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - if (is_syscall_success(regs)) - return regs->regs[2]; - else - return -regs->regs[2]; -} - +#define regs_return_value(_regs) ((_regs)->regs[2]) #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) diff --git a/trunk/arch/mips/kernel/ptrace.c b/trunk/arch/mips/kernel/ptrace.c index 7786b608d932..4e6ea1ffad46 100644 --- a/trunk/arch/mips/kernel/ptrace.c +++ b/trunk/arch/mips/kernel/ptrace.c @@ -560,9 +560,10 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) } out: - audit_syscall_entry(audit_arch(), regs->regs[2], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + if (unlikely(current->audit_context)) + audit_syscall_entry(audit_arch(), regs->regs[2], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); } /* @@ -571,7 +572,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), + -regs->regs[2]); if (!(current->ptrace & PT_PTRACED)) return; diff --git a/trunk/arch/powerpc/include/asm/ptrace.h b/trunk/arch/powerpc/include/asm/ptrace.h index 78a205162fd7..48223f9b8728 100644 --- a/trunk/arch/powerpc/include/asm/ptrace.h +++ b/trunk/arch/powerpc/include/asm/ptrace.h @@ -86,18 +86,7 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define kernel_stack_pointer(regs) ((regs)->gpr[1]) -static inline int is_syscall_success(struct pt_regs *regs) -{ - return !(regs->ccr & 0x10000000); -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - if (is_syscall_success(regs)) - return regs->gpr[3]; - else - return -regs->gpr[3]; -} +#define regs_return_value(regs) ((regs)->gpr[3]) #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/trunk/arch/powerpc/kernel/ptrace.c b/trunk/arch/powerpc/kernel/ptrace.c index 5b43325402bc..5de73dbd15c7 100644 --- a/trunk/arch/powerpc/kernel/ptrace.c +++ b/trunk/arch/powerpc/kernel/ptrace.c @@ -1724,20 +1724,22 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); + if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(AUDIT_ARCH_PPC64, - regs->gpr[0], - regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, + regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else #endif - audit_syscall_entry(AUDIT_ARCH_PPC, - regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + regs->gpr[6] & 0xffffffff); + } return ret ?: regs->gpr[0]; } @@ -1746,7 +1748,9 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, + regs->result); if 
(unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->result); diff --git a/trunk/arch/s390/include/asm/ptrace.h b/trunk/arch/s390/include/asm/ptrace.h index aeb77f017985..56da355678f4 100644 --- a/trunk/arch/s390/include/asm/ptrace.h +++ b/trunk/arch/s390/include/asm/ptrace.h @@ -541,13 +541,9 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) +#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->gprs[2]; -} - int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); diff --git a/trunk/arch/s390/kernel/ptrace.c b/trunk/arch/s390/kernel/ptrace.c index 9d82ed4bcb27..573bc29551ef 100644 --- a/trunk/arch/s390/kernel/ptrace.c +++ b/trunk/arch/s390/kernel/ptrace.c @@ -740,17 +740,20 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - audit_syscall_entry(is_compat_task() ? - AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); + if (unlikely(current->audit_context)) + audit_syscall_entry(is_compat_task() ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); return ret ?: regs->gprs[2]; } asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), + regs->gprs[2]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); diff --git a/trunk/arch/sh/include/asm/ptrace_32.h b/trunk/arch/sh/include/asm/ptrace_32.h index 2d3e906aa722..6c2239cca1a2 100644 --- a/trunk/arch/sh/include/asm/ptrace_32.h +++ b/trunk/arch/sh/include/asm/ptrace_32.h @@ -76,10 +76,7 @@ struct pt_dspregs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->regs[0]; -} +#define regs_return_value(_regs) ((_regs)->regs[0]) #endif /* __KERNEL__ */ diff --git a/trunk/arch/sh/include/asm/ptrace_64.h b/trunk/arch/sh/include/asm/ptrace_64.h index eb3fcceaf64b..bf9be7764d69 100644 --- a/trunk/arch/sh/include/asm/ptrace_64.h +++ b/trunk/arch/sh/include/asm/ptrace_64.h @@ -13,10 +13,7 @@ struct pt_regs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->regs[3]; -} +#define regs_return_value(_regs) ((_regs)->regs[3]) #endif /* __KERNEL__ */ diff --git a/trunk/arch/sh/kernel/ptrace_32.c b/trunk/arch/sh/kernel/ptrace_32.c index a3e651563763..92b3c276339a 100644 --- a/trunk/arch/sh/kernel/ptrace_32.c +++ b/trunk/arch/sh/kernel/ptrace_32.c @@ -518,9 +518,10 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[0]); - audit_syscall_entry(audit_arch(), regs->regs[3], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + if (unlikely(current->audit_context)) + audit_syscall_entry(audit_arch(), regs->regs[3], + 
regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); return ret ?: regs->regs[0]; } @@ -529,7 +530,9 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), + regs->regs[0]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[0]); diff --git a/trunk/arch/sh/kernel/ptrace_64.c b/trunk/arch/sh/kernel/ptrace_64.c index 3d0080b5c976..c8f97649f354 100644 --- a/trunk/arch/sh/kernel/ptrace_64.c +++ b/trunk/arch/sh/kernel/ptrace_64.c @@ -536,9 +536,10 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[9]); - audit_syscall_entry(audit_arch(), regs->regs[1], - regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); + if (unlikely(current->audit_context)) + audit_syscall_entry(audit_arch(), regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); return ret ?: regs->regs[9]; } @@ -547,7 +548,9 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), + regs->regs[9]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); diff --git a/trunk/arch/sparc/include/asm/ptrace.h b/trunk/arch/sparc/include/asm/ptrace.h index c00c3b5c2806..a0e1bcf843a1 100644 --- a/trunk/arch/sparc/include/asm/ptrace.h +++ b/trunk/arch/sparc/include/asm/ptrace.h @@ -207,15 +207,7 @@ do { current_thread_info()->syscall_noerror = 1; \ #define instruction_pointer(regs) ((regs)->tpc) #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) -static inline int is_syscall_success(struct pt_regs *regs) -{ - return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->u_regs[UREG_I0]; -} +#define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *); #else diff --git a/trunk/arch/sparc/kernel/ptrace_64.c b/trunk/arch/sparc/kernel/ptrace_64.c index 9388844cd88c..96ee50a80661 100644 --- a/trunk/arch/sparc/kernel/ptrace_64.c +++ b/trunk/arch/sparc/kernel/ptrace_64.c @@ -1071,22 +1071,32 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); - audit_syscall_entry((test_thread_flag(TIF_32BIT) ? - AUDIT_ARCH_SPARC : - AUDIT_ARCH_SPARC64), - regs->u_regs[UREG_G1], - regs->u_regs[UREG_I0], - regs->u_regs[UREG_I1], - regs->u_regs[UREG_I2], - regs->u_regs[UREG_I3]); + if (unlikely(current->audit_context) && !ret) + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? 
+ AUDIT_ARCH_SPARC : + AUDIT_ARCH_SPARC64), + regs->u_regs[UREG_G1], + regs->u_regs[UREG_I0], + regs->u_regs[UREG_I1], + regs->u_regs[UREG_I2], + regs->u_regs[UREG_I3]); return ret; } asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - audit_syscall_exit(regs); +#ifdef CONFIG_AUDITSYSCALL + if (unlikely(current->audit_context)) { + unsigned long tstate = regs->tstate; + int result = AUDITSC_SUCCESS; + if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) + result = AUDITSC_FAILURE; + + audit_syscall_exit(result, regs->u_regs[UREG_I0]); + } +#endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->u_regs[UREG_G1]); diff --git a/trunk/arch/um/kernel/ptrace.c b/trunk/arch/um/kernel/ptrace.c index 06b190390505..c9da32b0c707 100644 --- a/trunk/arch/um/kernel/ptrace.c +++ b/trunk/arch/um/kernel/ptrace.c @@ -167,15 +167,17 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; int tracesysgood; - if (!entryexit) - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(regs), - UPT_SYSCALL_ARG1(regs), - UPT_SYSCALL_ARG2(regs), - UPT_SYSCALL_ARG3(regs), - UPT_SYSCALL_ARG4(regs)); - else - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) { + if (!entryexit) + audit_syscall_entry(HOST_AUDIT_ARCH, + UPT_SYSCALL_NR(regs), + UPT_SYSCALL_ARG1(regs), + UPT_SYSCALL_ARG2(regs), + UPT_SYSCALL_ARG3(regs), + UPT_SYSCALL_ARG4(regs)); + else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), + UPT_SYSCALL_RET(regs)); + } /* Fake a debug trap */ if (is_singlestep) diff --git a/trunk/arch/x86/ia32/ia32entry.S b/trunk/arch/x86/ia32/ia32entry.S index e3e734005e19..1106261856c8 100644 --- a/trunk/arch/x86/ia32/ia32entry.S +++ b/trunk/arch/x86/ia32/ia32entry.S @@ -14,7 +14,6 @@ #include #include #include -#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -190,7 +189,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call __audit_syscall_entry + call audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -207,13 +206,12 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ + cmpl $0,%eax /* is it < 0? 
*/ + setl %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/trunk/arch/x86/kernel/entry_32.S b/trunk/arch/x86/kernel/entry_32.S index 79d97e68f042..4af9fd2450a5 100644 --- a/trunk/arch/x86/kernel/entry_32.S +++ b/trunk/arch/x86/kernel/entry_32.S @@ -42,7 +42,6 @@ */ #include -#include #include #include #include @@ -454,7 +453,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call __audit_syscall_entry + call audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -465,10 +464,11 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..940ba711fc28 100644 --- a/trunk/arch/x86/kernel/entry_64.S +++ b/trunk/arch/x86/kernel/entry_64.S @@ -55,7 +55,6 @@ #include #include #include -#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -549,7 +548,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call __audit_syscall_entry() directly, and then + * We just call audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -559,21 +558,22 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call __audit_syscall_entry + call audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath /* - * Return fast path for syscall audit. Call __audit_syscall_exit() + * Return fast path for syscall audit. Call audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ - setbe %al /* 1 if so, 0 if not */ + cmpq $0,%rsi /* is it < 0? 
*/ + setl %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/trunk/arch/x86/kernel/ptrace.c b/trunk/arch/x86/kernel/ptrace.c index 50267386b766..89a04c7b5bb6 100644 --- a/trunk/arch/x86/kernel/ptrace.c +++ b/trunk/arch/x86/kernel/ptrace.c @@ -1392,18 +1392,20 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (unlikely(current->audit_context)) { + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif + } return ret ?: regs->orig_ax; } @@ -1412,7 +1414,8 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/trunk/arch/x86/kernel/vm86_32.c b/trunk/arch/x86/kernel/vm86_32.c index af17e1c966dc..863f8753ab0a 100644 --- a/trunk/arch/x86/kernel/vm86_32.c +++ b/trunk/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call __audit_syscall_exit since we do not exit via the normal paths */ + /*call audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - __audit_syscall_exit(1, 0); + audit_syscall_exit(AUDITSC_RESULT(0), 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/trunk/arch/x86/um/shared/sysdep/ptrace.h b/trunk/arch/x86/um/shared/sysdep/ptrace.h index 5ef9344a8b24..711b1621747f 100644 --- a/trunk/arch/x86/um/shared/sysdep/ptrace.h +++ b/trunk/arch/x86/um/shared/sysdep/ptrace.h @@ -3,8 +3,3 @@ #else #include "ptrace_64.h" #endif - -static inline long regs_return_value(struct uml_pt_regs *regs) -{ - return UPT_SYSCALL_RET(regs); -} diff --git a/trunk/arch/xtensa/kernel/ptrace.c b/trunk/arch/xtensa/kernel/ptrace.c index 2dff698ab02e..a0d042aa2967 100644 --- a/trunk/arch/xtensa/kernel/ptrace.c +++ b/trunk/arch/xtensa/kernel/ptrace.c @@ -334,7 +334,8 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); #if 0 - audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); + if (unlikely(current->audit_context)) + audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); #endif } diff --git a/trunk/block/cfq-iosched.c b/trunk/block/cfq-iosched.c index ee55019066a1..163263ddd381 100644 --- a/trunk/block/cfq-iosched.c +++ b/trunk/block/cfq-iosched.c @@ -3117,17 +3117,18 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *old_cfqq = cfqd->active_queue; + cfq_log_cfqq(cfqd, cfqq, "preempt"); + cfq_slice_expired(cfqd, 1); /* * workload type is changed, don't save slice, 
otherwise preempt * doesn't happen */ - if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq)) + if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) cfqq->cfqg->saved_workload_slice = 0; - cfq_slice_expired(cfqd, 1); - /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. diff --git a/trunk/drivers/usb/host/ehci-xilinx-of.c b/trunk/drivers/usb/host/ehci-xilinx-of.c index 9c2cc4633894..32793ce3d9e9 100644 --- a/trunk/drivers/usb/host/ehci-xilinx-of.c +++ b/trunk/drivers/usb/host/ehci-xilinx-of.c @@ -183,7 +183,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op) } irq = irq_of_parse_and_map(dn, 0); - if (!irq) { + if (irq == NO_IRQ) { printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__); rv = -EBUSY; goto err_irq; diff --git a/trunk/drivers/xen/xen-balloon.c b/trunk/drivers/xen/xen-balloon.c index 596e6a7b17d6..3832e303c33a 100644 --- a/trunk/drivers/xen/xen-balloon.c +++ b/trunk/drivers/xen/xen-balloon.c @@ -221,7 +221,7 @@ static int register_balloon(struct device *dev) { int i, error; - error = subsys_system_register(&balloon_subsys, NULL); + error = bus_register(&balloon_subsys); if (error) return error; diff --git a/trunk/fs/btrfs/Kconfig b/trunk/fs/btrfs/Kconfig index d33f01c08b60..ecb9fd3be143 100644 --- a/trunk/fs/btrfs/Kconfig +++ b/trunk/fs/btrfs/Kconfig @@ -31,22 +31,3 @@ config BTRFS_FS_POSIX_ACL Linux website . If you don't know what Access Control Lists are, say N - -config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DANGEROUS)" - depends on BTRFS_FS - help - Adds code that examines all block write requests (including - writes of the super block). The goal is to verify that the - state of the filesystem on disk is always consistent, i.e., - after a power-loss or kernel panic event the filesystem is - in a consistent state. - - If the integrity check tool is included and activated in - the mount options, plenty of kernel memory is used, and - plenty of additional CPU cycles are spent. Enabling this - functionality is not intended for normal use. 
- - In most cases, unless you are a btrfs developer who needs - to verify the integrity of (super)-block write requests - during the run of a regression test, say N diff --git a/trunk/fs/btrfs/Makefile b/trunk/fs/btrfs/Makefile index 0c4fa2befae7..c0ddfd29c5e5 100644 --- a/trunk/fs/btrfs/Makefile +++ b/trunk/fs/btrfs/Makefile @@ -8,7 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o + reada.o backref.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/trunk/fs/btrfs/backref.c b/trunk/fs/btrfs/backref.c index b9a843226de8..22c64fff1bd5 100644 --- a/trunk/fs/btrfs/backref.c +++ b/trunk/fs/btrfs/backref.c @@ -19,789 +19,18 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" -#include "ulist.h" -#include "transaction.h" -#include "delayed-ref.h" -/* - * this structure records all encountered refs on the way up to the root - */ -struct __prelim_ref { +struct __data_ref { struct list_head list; - u64 root_id; - struct btrfs_key key; - int level; - int count; - u64 parent; - u64 wanted_disk_byte; + u64 inum; + u64 root; + u64 extent_data_item_offset; }; -static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, u64 parent, - u64 wanted_disk_byte, int count) -{ - struct __prelim_ref *ref; - - /* in case we're adding delayed refs, we're holding the refs spinlock */ - ref = kmalloc(sizeof(*ref), GFP_ATOMIC); - if (!ref) - return -ENOMEM; - - ref->root_id = root_id; - if (key) - ref->key = *key; - else - memset(&ref->key, 0, sizeof(ref->key)); - - ref->level = level; - ref->count = count; - ref->parent = parent; - ref->wanted_disk_byte = wanted_disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, - struct extent_buffer *eb, int level, - u64 wanted_objectid, u64 wanted_disk_byte) -{ - int ret; - int slot; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; +struct __shared_ref { + struct list_head list; u64 disk_byte; - -add_parent: - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); - if (ret < 0) - return ret; - - if (level != 0) - return 0; - - /* - * if the current leaf is full with EXTENT_DATA items, we must - * check the next one if that holds a reference as well. - * ref->count cannot be used to skip this check. - * repeat this until we don't find any additional EXTENT_DATA items. 
- */ - while (1) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - if (ret) - return 0; - - eb = path->nodes[0]; - for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != wanted_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) - return 0; - fi = btrfs_item_ptr(eb, slot, - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == wanted_disk_byte) - goto add_parent; - } - } - - return 0; -} - -/* - * resolve an indirect backref in the form (root_id, key, level) - * to a logical address - */ -static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct __prelim_ref *ref, - struct ulist *parents) -{ - struct btrfs_path *path; - struct btrfs_root *root; - struct btrfs_key root_key; - struct btrfs_key key = {0}; - struct extent_buffer *eb; - int ret = 0; - int root_level; - int level = ref->level; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - root_key.objectid = ref->root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - rcu_read_lock(); - root_level = btrfs_header_level(root->node); - rcu_read_unlock(); - - if (root_level + 1 == level) - goto out; - - path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); - pr_debug("search slot in root %llu (level %d, ref count %d) returned " - "%d for key (%llu %u %llu)\n", - (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key.objectid, ref->key.type, - (unsigned long long)ref->key.offset); - if (ret < 0) - goto out; - - eb = path->nodes[level]; - if (!eb) { - WARN_ON(1); - ret = 1; - goto out; - } - - if (level == 0) { - if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - } - - /* the last two parameters will only be used for level == 0 */ - ret = add_all_parents(root, path, parents, eb, level, key.objectid, - ref->wanted_disk_byte); -out: - btrfs_free_path(path); - return ret; -} - -/* - * resolve all indirect backrefs from the list - */ -static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct list_head *head) -{ - int err; - int ret = 0; - struct __prelim_ref *ref; - struct __prelim_ref *ref_safe; - struct __prelim_ref *new_ref; - struct ulist *parents; - struct ulist_node *node; - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - /* - * _safe allows us to insert directly after the current item without - * iterating over the newly inserted items. - * we're also allowed to re-assign ref during iteration. - */ - list_for_each_entry_safe(ref, ref_safe, head, list) { - if (ref->parent) /* already direct */ - continue; - if (ref->count == 0) - continue; - err = __resolve_indirect_ref(fs_info, ref, parents); - if (err) { - if (ret == 0) - ret = err; - continue; - } - - /* we put the first parent into the ref at hand */ - node = ulist_next(parents, NULL); - ref->parent = node ? 
node->val : 0; - - /* additional parents require new refs being added here */ - while ((node = ulist_next(parents, node))) { - new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); - if (!new_ref) { - ret = -ENOMEM; - break; - } - memcpy(new_ref, ref, sizeof(*ref)); - new_ref->parent = node->val; - list_add(&new_ref->list, &ref->list); - } - ulist_reinit(parents); - } - - ulist_free(parents); - return ret; -} - -/* - * merge two lists of backrefs and adjust counts accordingly - * - * mode = 1: merge identical keys, if key is set - * mode = 2: merge identical parents - */ -static int __merge_refs(struct list_head *head, int mode) -{ - struct list_head *pos1; - - list_for_each(pos1, head) { - struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; - - ref1 = list_entry(pos1, struct __prelim_ref, list); - - if (mode == 1 && ref1->key.type == 0) - continue; - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; - - ref2 = list_entry(pos2, struct __prelim_ref, list); - - if (mode == 1) { - if (memcmp(&ref1->key, &ref2->key, - sizeof(ref1->key)) || - ref1->level != ref2->level || - ref1->root_id != ref2->root_id) - continue; - ref1->count += ref2->count; - } else { - if (ref1->parent != ref2->parent) - continue; - ref1->count += ref2->count; - } - list_del(&ref2->list); - kfree(ref2); - } - - } - return 0; -} - -/* - * add all currently queued delayed refs from this head whose seq nr is - * smaller or equal that seq to the list - */ -static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct btrfs_key *info_key, - struct list_head *prefs) -{ - struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; - int sgn; - int ret; - - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(info_key, &extent_op->key); - - while ((n = rb_prev(n))) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - - if (node->seq > seq) - continue; - - switch (node->action) { - case BTRFS_ADD_DELAYED_EXTENT: - case BTRFS_UPDATE_DELAYED_HEAD: - WARN_ON(1); - continue; - case BTRFS_ADD_DELAYED_REF: - sgn = 1; - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - break; - default: - BUG_ON(1); - } - switch (node->type) { - case BTRFS_TREE_BLOCK_REF_KEY: { - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, - ref->level + 1, 0, node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_SHARED_BLOCK_REF_KEY: { - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, - ref->level + 1, ref->parent, - node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, - node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, 
ref->root, &key, 0, - ref->parent, node->bytenr, - node->ref_mod * sgn); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - } - - return 0; -} - -/* - * add all inline backrefs for bytenr to the list - */ -static int __add_inline_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int *info_level, - struct list_head *prefs) -{ - int ret; - int slot; - struct extent_buffer *leaf; - struct btrfs_key key; - unsigned long ptr; - unsigned long end; - struct btrfs_extent_item *ei; - u64 flags; - u64 item_size; - - /* - * enumerate all inline refs - */ - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - - item_size = btrfs_item_size_nr(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - struct btrfs_tree_block_info *info; - struct btrfs_disk_key disk_key; - - info = (struct btrfs_tree_block_info *)ptr; - *info_level = btrfs_tree_block_level(leaf, info); - btrfs_tree_block_key(leaf, info, &disk_key); - btrfs_disk_key_to_cpu(info_key, &disk_key); - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); - } - - while (ptr < end) { - struct btrfs_extent_inline_ref *iref; - u64 offset; - int type; - - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); - offset = btrfs_extent_inline_ref_offset(leaf, iref); - - switch (type) { - case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, - *info_level + 1, offset, - bytenr, 1); - break; - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_shared_data_ref *sdref; - int count; - - sdref = (struct btrfs_shared_data_ref *)(iref + 1); - count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, - bytenr, count); - break; - } - case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, info_key, - *info_level + 1, 0, bytenr, 1); - break; - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_extent_data_ref *dref; - int count; - u64 root; - - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - count = btrfs_extent_data_ref_count(leaf, dref); - key.objectid = btrfs_extent_data_ref_objectid(leaf, - dref); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = btrfs_extent_data_ref_offset(leaf, dref); - root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, - count); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - ptr += btrfs_extent_inline_ref_size(type); - } - - return 0; -} - -/* - * add all non-inline backrefs for bytenr to the list - */ -static int __add_keyed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int info_level, - struct list_head *prefs) -{ - struct btrfs_root *extent_root = fs_info->extent_root; - int ret; - int slot; - struct extent_buffer *leaf; - struct btrfs_key key; - - while (1) { - ret = btrfs_next_item(extent_root, path); - if (ret < 0) - break; - if (ret) { - ret = 0; - break; - } - - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid != bytenr) - break; - if (key.type < BTRFS_TREE_BLOCK_REF_KEY) - continue; - if (key.type > BTRFS_SHARED_DATA_REF_KEY) - break; - - switch (key.type) { - case 
BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, - info_level + 1, key.offset, - bytenr, 1); - break; - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_shared_data_ref *sdref; - int count; - - sdref = btrfs_item_ptr(leaf, slot, - struct btrfs_shared_data_ref); - count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, - bytenr, count); - break; - } - case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, info_key, - info_level + 1, 0, bytenr, 1); - break; - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_extent_data_ref *dref; - int count; - u64 root; - - dref = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_data_ref); - count = btrfs_extent_data_ref_count(leaf, dref); - key.objectid = btrfs_extent_data_ref_objectid(leaf, - dref); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = btrfs_extent_data_ref_offset(leaf, dref); - root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - } - - return ret; -} - -/* - * this adds all existing backrefs (inline backrefs, backrefs and delayed - * refs) for the given bytenr to the refs list, merges duplicates and resolves - * indirect refs to their parent bytenr. - * When roots are found, they're added to the roots list - * - * FIXME some caching might speed things up - */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_key info_key = { 0 }; - struct btrfs_delayed_ref_root *delayed_refs = NULL; - struct btrfs_delayed_ref_head *head = NULL; - int info_level = 0; - int ret; - struct list_head prefs_delayed; - struct list_head prefs; - struct __prelim_ref *ref; - - INIT_LIST_HEAD(&prefs); - INIT_LIST_HEAD(&prefs_delayed); - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * grab both a lock on the path and a lock on the delayed ref head. 
- * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ -again: - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - /* - * look if there are updates for this ref queued and lock the head - */ - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - goto again; - } - ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); - if (ret) - goto out; - } - spin_unlock(&delayed_refs->lock); - - if (path->slots[0]) { - struct extent_buffer *leaf; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY) { - ret = __add_inline_refs(fs_info, path, bytenr, - &info_key, &info_level, &prefs); - if (ret) - goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, - info_level, &prefs); - if (ret) - goto out; - } - } - btrfs_release_path(path); - - /* - * when adding the delayed refs above, the info_key might not have - * been known yet. Go over the list and replace the missing keys - */ - list_for_each_entry(ref, &prefs_delayed, list) { - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) - memcpy(&ref->key, &info_key, sizeof(ref->key)); - } - list_splice_init(&prefs_delayed, &prefs); - - ret = __merge_refs(&prefs, 1); - if (ret) - goto out; - - ret = __resolve_indirect_refs(fs_info, &prefs); - if (ret) - goto out; - - ret = __merge_refs(&prefs, 2); - if (ret) - goto out; - - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); - if (ref->count && ref->root_id && ref->parent == 0) { - /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); - BUG_ON(ret < 0); - } - if (ref->count && ref->parent) { - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); - BUG_ON(ret < 0); - } - kfree(ref); - } - -out: - if (head) - mutex_unlock(&head->mutex); - btrfs_free_path(path); - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - kfree(ref); - } - while (!list_empty(&prefs_delayed)) { - ref = list_first_entry(&prefs_delayed, struct __prelim_ref, - list); - list_del(&ref->list); - kfree(ref); - } - - return ret; -} - -/* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. 
- * - * returns 0 on success, <0 on error - */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **leafs) -{ - struct ulist *tmp; - int ret; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); - return -ENOMEM; - } - - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); - ulist_free(tmp); - - if (ret < 0 && ret != -ENOENT) { - ulist_free(*leafs); - return ret; - } - - return 0; -} - -/* - * walk all backrefs for a given extent to find all roots that reference this - * extent. Walking a backref means finding all extents that reference this - * extent and in turn walk the backrefs of those, too. Naturally this is a - * recursive process, but here it is implemented in an iterative fashion: We - * find all referencing extents for the extent in question and put them on a - * list. In turn, we find all referencing extents for those, further appending - * to the list. The way we iterate the list allows adding more elements after - * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. - * - * returns 0 on success, < 0 on error. - */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots) -{ - struct ulist *tmp; - struct ulist_node *node = NULL; - int ret; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); - return -ENOMEM; - } - - while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots); - if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - return ret; - } - node = ulist_next(tmp, node); - if (!node) - break; - bytenr = node->val; - } - - ulist_free(tmp); - return 0; -} - +}; static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, @@ -952,11 +181,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) { - pr_debug("logical %llu is not within any extent\n", - (unsigned long long)logical); + found_key->objectid + found_key->offset <= logical) return -ENOENT; - } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -965,13 +191,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); - pr_debug("logical %llu is at position %llu within the extent (%llu " - "EXTENT_ITEM %llu) flags %#llx size %u\n", - (unsigned long long)logical, - (unsigned long long)(logical - found_key->objectid), - (unsigned long long)found_key->objectid, - (unsigned long long)found_key->offset, - (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -1068,11 +287,128 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 logical, - u64 orig_extent_item_objectid, - u64 extent_item_pos, u64 root, - 
iterate_extent_inodes_t *iterate, void *ctx) +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. 
+ */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) { u64 disk_byte; struct btrfs_key key; @@ -1080,10 +416,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int slot; int nritems; - int ret = 0; - int extent_type; - u64 data_offset; - u64 data_len; + int ret; + int found = 0; eb = read_tree_block(fs_info->tree_root, logical, fs_info->tree_root->leafsize, 0); @@ -1101,99 +435,149 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) - continue; - - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - continue; - - pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " - "root %llu\n", orig_extent_item_objectid, - key.objectid, key.offset, root); - ret = iterate(key.objectid, - key.offset + (extent_item_pos - data_offset), - root, ctx); - if (ret) { - pr_debug("stopping iteration because ret=%d\n", ret); - break; + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; } - free_extent_buffer(eb); + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + free_extent_buffer(eb); return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. + * the given parameters. will use the path given as a parameter and return it + * released. * when the iterator function returns a non-zero value, iteration stops. - * path is guaranteed to be in released state when iterate() is called. 
*/ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u64 extent_item_objectid, u64 extent_item_pos, + u64 extent_item_objectid, + u64 extent_offset, iterate_extent_inodes_t *iterate, void *ctx) { + unsigned long ptr = 0; + int last; int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct btrfs_trans_handle *trans; - struct ulist *refs; - struct ulist *roots; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; - struct seq_list seq_elem; - struct btrfs_delayed_ref_root *delayed_refs; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - pr_debug("resolving all inodes for extent %llu\n", - extent_item_objectid); - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); - - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - extent_item_pos, seq_elem.seq, - &refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; - if (ret) - goto out; + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); - while (!ret && (ref_node = ulist_next(refs, ref_node))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, - seq_elem.seq, &roots); - if (ret) + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) break; - while (!ret && (root_node = ulist_next(roots, root_node))) { - pr_debug("root %llu references leaf %llu\n", - root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, path, ref_node->val, - extent_item_objectid, - extent_item_pos, root_node->val, - iterate, ctx); + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); } } - ulist_free(refs); - ulist_free(roots); -out: - btrfs_put_delayed_seq(delayed_refs, &seq_elem); - btrfs_end_transaction(trans, fs_info->extent_root); + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. 
this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. + */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + return ret; } @@ -1202,20 +586,19 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 extent_item_pos; + u64 offset; struct btrfs_key found_key; ret = extent_from_logical(fs_info, logical, path, &found_key); - btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - extent_item_pos = logical - found_key.objectid; + offset = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, iterate, ctx); + offset, iterate, ctx); return ret; } @@ -1260,10 +643,6 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ - pr_debug("following ref at offset %u for inode %llu in " - "tree %llu\n", cur, - (unsigned long long)found_key.objectid, - (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); if (ret) { free_extent_buffer(eb); @@ -1304,14 +683,10 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { - pr_debug("missed path, not enough space. 
missing bytes: %lu, " - "constructed so far: %s\n", - (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; diff --git a/trunk/fs/btrfs/backref.h b/trunk/fs/btrfs/backref.h index d00dfa9ca934..92618837cb8f 100644 --- a/trunk/fs/btrfs/backref.h +++ b/trunk/fs/btrfs/backref.h @@ -20,7 +20,6 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" -#include "ulist.h" struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -55,10 +54,6 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots); - struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); diff --git a/trunk/fs/btrfs/btrfs_inode.h b/trunk/fs/btrfs/btrfs_inode.h index 9b9b15fd5204..634608d2a6d0 100644 --- a/trunk/fs/btrfs/btrfs_inode.h +++ b/trunk/fs/btrfs/btrfs_inode.h @@ -51,9 +51,6 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; - /* held while doing delalloc reservations */ - struct mutex delalloc_mutex; - /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; diff --git a/trunk/fs/btrfs/check-integrity.c b/trunk/fs/btrfs/check-integrity.c deleted file mode 100644 index ad0b3ba735b7..000000000000 --- a/trunk/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,3068 +0,0 @@ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and a FLUSH request to the device where - * these blocks are located was received and completed. - * 2b. 
All referenced blocks need to have a generation - * number which is equal to the parent's number. - * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. - * - * Example #1, apply integrity checks to all metadata: - * mount /dev/sdb1 /mnt -o check_int - * - * Example #2, apply integrity checks to all metadata and - * to data extents: - * mount /dev/sdb1 /mnt -o check_int_data - * - * Example #3, apply integrity checks to all metadata and dump - * the tree that the super block references to kernel messages - * each time after a super block was written: - * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 - * - * If the integrity check tool is included and activated in - * the mount options, plenty of kernel memory is used, and - * plenty of additional CPU cycles are spent. Enabling this - * functionality is not intended for normal use. In most - * cases, unless you are a btrfs developer who needs to verify - * the integrity of (super)-block write requests, do not - * enable the config option BTRFS_FS_CHECK_INTEGRITY to - * include and compile the integrity check tool. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include <linux/mutex.h> -#include <linux/crc32c.h> -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "extent_io.h" -#include "disk-io.h" -#include "volumes.h" -#include "print-tree.h" -#include "locking.h" -#include "check-integrity.h" - -#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 -#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 -#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 -#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 -#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 -#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, - * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE - -#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) - -/* - * The definition of the bitmask fields for the print_mask. - * They are specified with the mount option check_integrity_print_mask.
- */ -#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 -#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 -#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 -#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 -#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 -#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 -#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 -#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 -#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 -#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 -#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 -#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 - -struct btrfsic_dev_state; -struct btrfsic_state; - -struct btrfsic_block { - u32 magic_num; /* only used for debug purposes */ - unsigned int is_metadata:1; /* if it is meta-data, not data-data */ - unsigned int is_superblock:1; /* if it is one of the superblocks */ - unsigned int is_iodone:1; /* if is done by lower subsystem */ - unsigned int iodone_w_error:1; /* error was indicated to endio */ - unsigned int never_written:1; /* block was added because it was - * referenced, not because it was - * written */ - unsigned int mirror_num:2; /* large enough to hold - * BTRFS_SUPER_MIRROR_MAX */ - struct btrfsic_dev_state *dev_state; - u64 dev_bytenr; /* key, physical byte num on disk */ - u64 logical_bytenr; /* logical byte num on disk */ - u64 generation; - struct btrfs_disk_key disk_key; /* extra info to print in case of - * issues, will not always be correct */ - struct list_head collision_resolving_node; /* list node */ - struct list_head all_blocks_node; /* list node */ - - /* the following two lists contain block_link items */ - struct list_head ref_to_list; /* list */ - struct list_head ref_from_list; /* list */ - struct btrfsic_block *next_in_same_bio; - void *orig_bio_bh_private; - union { - bio_end_io_t *bio; - bh_end_io_t *bh; - } orig_bio_bh_end_io; - int submit_bio_bh_rw; - u64 flush_gen; /* only valid if !never_written */ -}; - -/* - * Elements of this type are allocated dynamically and required because - * each block object can refer to and can be ref from multiple blocks. - * The key to lookup them in the hashtable is the dev_bytenr of - * the block ref to plus the one from the block refered from. - * The fact that they are searchable via a hashtable and that a - * ref_cnt is maintained is not required for the btrfs integrity - * check algorithm itself, it is only used to make the output more - * beautiful in case that an error is detected (an error is defined - * as a write operation to a block while that block is still referenced). 
- */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; - char name[BDEVNAME_SIZE]; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char *data; - struct buffer_head *bh; /* do not use if set to NULL */ -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - int csum_size; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_root *root; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; -}; - -static void btrfsic_block_init(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_alloc(void); -static void btrfsic_block_free(struct btrfsic_block *b); -static void btrfsic_block_link_init(struct btrfsic_block_link *n); -static struct btrfsic_block_link *btrfsic_block_link_alloc(void); -static void btrfsic_block_link_free(struct btrfsic_block_link *n); -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 
dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, - struct btrfsic_dev_state_hashtable *h); -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices); -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - struct btrfs_header *hdr, - int limit_nesting, int force_iodone_flag); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static void btrfsic_dump_database(struct btrfsic_state *state); -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size); -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, u8 *mapped_data, - unsigned int len, struct bio *bio, - int *bio_is_patched, - struct buffer_head *bh, - int submit_bio_bh_rw); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); -static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct 
btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_bh_private = NULL; - b->orig_bio_bh_end_io.bio = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - ds->name[0] = '\0'; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - 
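Every object type removed here (btrfsic_block, btrfsic_block_link, btrfsic_dev_state) follows the same defensive-allocation pattern: the structure is zero-allocated, stamped with a per-type magic number in its init helper, and that magic is asserted again on free, so use-after-free and type-confusion bugs surface immediately. Below is a minimal userspace sketch of that pattern; the example_* names are hypothetical and not part of this patch.

#include <assert.h>
#include <stdlib.h>

/* per-type constant, in the spirit of BTRFSIC_BLOCK_MAGIC_NUMBER */
#define EXAMPLE_MAGIC_NUMBER 0x14491051u

struct example_block {
	unsigned int magic_num;		/* only used for debug purposes */
	unsigned long long dev_bytenr;
};

static struct example_block *example_block_alloc(void)
{
	/* zero-filled allocation, analogous to kzalloc() */
	struct example_block *b = calloc(1, sizeof(*b));

	if (b)
		b->magic_num = EXAMPLE_MAGIC_NUMBER;	/* stamp on init */
	return b;
}

static void example_block_free(struct example_block *b)
{
	/* NULL is tolerated; a wrong or clobbered magic number is a bug */
	assert(b == NULL || b->magic_num == EXAMPLE_MAGIC_NUMBER);
	free(b);
}

int main(void)
{
	struct example_block *b = example_block_alloc();

	example_block_free(b);		/* magic check passes */
	return 0;
}

The deleted kernel code expresses the same invariant with kzalloc(..., GFP_NOFS) and BUG_ON() in place of calloc() and assert().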
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); - - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void 
btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); - - if (ds->bdev == bdev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - int ret; - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int pass; - - BUG_ON(NULL == state); - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); - if (NULL == selected_super) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; - } - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - printk(KERN_INFO "btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - state->csum_size = btrfs_super_csum_size(selected_super); - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - struct btrfs_header *hdr; - - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO 
"btrfsic:" - " btrfsic_map_block(root @%llu," - " mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { - printk(KERN_INFO - "btrfsic: read @logical %llu failed!\n", - (unsigned long long) - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - hdr = (struct btrfs_header *)tmp_next_block_ctx.data; - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - hdr, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct buffer_head *bh; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); - if (NULL == bh) - return -1; - super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & 4095)); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, - sizeof(super_tmp->magic)) || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { - brelse(bh); - return 0; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - brelse(bh); - return -1; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", - superblock_bdev, device->name, - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - &state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - 
memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num)) { - printk(KERN_INFO "btrfsic: btrfsic_map_block(" - "bytenr @%llu, mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - brelse(bh); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - brelse(bh); - return -1; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - brelse(bh); - return -1; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - - brelse(bh); - return 0; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (NULL == sf) - printk(KERN_INFO "btrfsic: alloc memory failed!\n"); - else - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - struct btrfs_header *const first_hdr, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - - sf = &initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; 
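/*
 * The frame filled in above takes the place of a C call frame: when a
 * child tree block must be visited, a new btrfsic_stack_frame is
 * allocated, chained via ->prev and entered by jumping back to
 * continue_with_new_stack_frame, so the walk can nest arbitrarily deep
 * without occupying kernel stack space (see the btrfsic_stack_frame
 * declaration above).
 */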
- sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = le64_to_cpu(sf->hdr->generation); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = le32_to_cpu(leafhdr->header.nritems); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "leaf %llu items %d generation %llu" - " owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - sf->nr, - (unsigned long long) - le64_to_cpu(leafhdr->header.generation), - (unsigned long long) - le64_to_cpu(leafhdr->header.owner)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item *disk_item = leafhdr->items + sf->i; - struct btrfs_disk_key *disk_key = &disk_item->key; - u8 type; - const u32 item_offset = le32_to_cpu(disk_item->offset); - - type = disk_key->type; - - if (BTRFS_ROOT_ITEM_KEY == type) { - const struct btrfs_root_item *const root_item = - (struct btrfs_root_item *) - (sf->block_ctx->data + - offsetof(struct btrfs_leaf, items) + - item_offset); - const u64 next_bytenr = - le64_to_cpu(root_item->bytenr); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - le64_to_cpu(root_item-> - generation)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.data; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = le32_to_cpu(nodehdr->header.nritems); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "node %llu level %d items %d" - " generation %llu owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - (unsigned long long) - le64_to_cpu(nodehdr->header.generation), - (unsigned long long) - le64_to_cpu(nodehdr->header.owner)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr *disk_key_ptr = - nodehdr->ptrs + sf->i; - const u64 next_bytenr = - le64_to_cpu(disk_key_ptr->blockptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &disk_key_ptr->key, - 
le64_to_cpu(disk_key_ptr->generation)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.data; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) - goto one_stack_frame_backwards; - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation) -{ - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - BTRFSIC_BLOCK_SIZE, - next_block_ctx, *mirror_nump); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - (unsigned long long)next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, - btrfsic_get_block_type(state, next_block), - (unsigned long long)next_block->logical_bytenr); - } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - 
printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, - btrfsic_get_block_type(state, next_block)); - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { - printk(KERN_INFO - "btrfsic: read block @logical %llu failed!\n", - (unsigned long long)next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - (*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - int ret; - struct btrfs_file_extent_item *file_extent_item = - (struct btrfs_file_extent_item *)(block_ctx->data + - offsetof(struct btrfs_leaf, - items) + item_offset); - u64 next_bytenr = - le64_to_cpu(file_extent_item->disk_bytenr) + - le64_to_cpu(file_extent_item->offset); - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); - u64 generation = le64_to_cpu(file_extent_item->generation); - struct btrfsic_block_link *l; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," - " offset = %llu, num_bytes = %llu\n", - file_extent_item->type, - (unsigned long long) - le64_to_cpu(file_extent_item->disk_bytenr), - (unsigned long long) - le64_to_cpu(file_extent_item->offset), - (unsigned long long) - le64_to_cpu(file_extent_item->num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) - return 0; - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > BTRFSIC_BLOCK_SIZE) - chunk_len = BTRFSIC_BLOCK_SIZE; - else - chunk_len = num_bytes; - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct 
btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "btrfsic_handle_extent_data(" - "mirror_num=%d)\n", mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - printk(KERN_INFO - "\tdisk_bytenr = %llu, num_bytes %u\n", - (unsigned long long)next_bytenr, - chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu," - " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - printk(KERN_INFO - "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block" - " @%llu (%s/%llu/%d)" - " found in hash table, D," - " bytenr mismatch" - " (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx.dev->name, - (unsigned long long) - next_block_ctx.dev_bytenr, - mirror_num, - (unsigned long long) - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - int ret; - u64 length; - struct btrfs_bio *multi = NULL; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, - bytenr, &length, &multi, mirror_num); - - device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); - block_ctx_out->dev_bytenr = multi->stripes[0].physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; - - if (0 == ret) - kfree(multi); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out) -{ - block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); - block_ctx_out->dev_bytenr = bytenr; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; - if (NULL != block_ctx_out->dev) { - return 0; - } else { - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); - return -ENXIO; - } -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (NULL != block_ctx->bh) { - brelse(block_ctx->bh); - block_ctx->bh = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - block_ctx->bh = NULL; - if (block_ctx->dev_bytenr & 4095) { - printk(KERN_INFO - "btrfsic: read_block() with 
unaligned bytenr %llu\n", - (unsigned long long)block_ctx->dev_bytenr); - return -1; - } - if (block_ctx->len > 4096) { - printk(KERN_INFO - "btrfsic: read_block() with too huge size %d\n", - block_ctx->len); - return -1; - } - - block_ctx->bh = __bread(block_ctx->dev->bdev, - block_ctx->dev_bytenr >> 12, 4096); - if (NULL == block_ctx->bh) - return -1; - block_ctx->data = block_ctx->bh->b_data; - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - struct list_head *elem_all; - - BUG_ON(NULL == state); - - printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; - - printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); - - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - printk(KERN_INFO " %c @%llu (%s/%llu/%d)" - " refers %u* to" - " %c @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - - printk(KERN_INFO " %c @%llu (%s/%llu/%d)" - " is ref %u* from" - " %c @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long) - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - printk(KERN_INFO "\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size) -{ - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; - - h = (struct btrfs_header *)data; - - if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) - fail++; - - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, state->csum_size)) - crc_fail++; - - return fail || crc_fail; -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, - u8 *mapped_data, unsigned int len, - struct bio *bio, - int *bio_is_patched, - struct buffer_head *bh, - int submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct 
block_device *bdev = dev_state->bdev; - - WARN_ON(len > PAGE_SIZE); - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); - if (NULL != bio_is_patched) - *bio_is_patched = 0; - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - if (block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_data)->bytenr); - is_metadata = 1; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - printk(KERN_INFO - "[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr, - mapped_data); - } - if (block->logical_bytenr != bytenr) { - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch" - " (!= stored %llu).\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block), - (unsigned long long) - block->logical_bytenr); - block->logical_bytenr = bytenr; - } else if (state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } else { - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? 
' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - printk(KERN_INFO "btrfs: attempt to overwrite %c-block" - " @%llu (%s/%llu/%d), old(gen=%llu," - " objectid=%llu, type=%d, offset=%llu)," - " new(gen=%llu)," - " which is referenced by most recent superblock" - " (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(block->disk_key.objectid), - block->disk_key.type, - (unsigned long long) - le64_to_cpu(block->disk_key.offset), - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation), - (unsigned long long) - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - printk(KERN_INFO "btrfs: attempt to overwrite %c-block" - " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," - " which is not yet iodone!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation)); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - return; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if it is not referenced anymore - * because it still carries valuable information - * like whether it was ever written and IO completed. - */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, len, - bdev, &block_ctx); - else - ret = btrfsic_map_block(state, bytenr, len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", (unsigned long long)bytenr); - return; - } - block_ctx.data = mapped_data; - /* the following is required in case of writes to mirrors, - * use the same device state that was used for the lookup */ - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_bh_private = - bio->bi_private; - block->orig_bio_bh_end_io.bio = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.
- bio; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } else { - block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_data); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - printk(KERN_INFO - "[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, - 0, 0); - } - if (ret) - printk(KERN_INFO - "btrfsic: btrfsic_process_metablock" - "(root @%llu) failed!\n", - (unsigned long long)dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "Written block (%s/%llu/?)" - " !found in hash table, D.\n", - dev_state->name, - (unsigned long long)dev_bytenr); - if (!state->include_extent_data) - return; /* ignore that written D block */ - - /* this is getting ugly for the - * include_extent_data case... 
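(editor's note, inferred from the code below: the logical bytenr of a data block cannot be read back from its payload, so a block_ctx is fabricated by hand with bytenr 0 instead of going through btrfsic_map_block())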
*/ - bytenr = 0; /* unknown */ - block_ctx.start = bytenr; - block_ctx.len = len; - block_ctx.bh = NULL; - } else { - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/?)" - " !found in hash table, M.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr); - - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, - 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", - (unsigned long long)dev_bytenr); - return; - } - } - block_ctx.data = mapped_data; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - - block = btrfsic_block_alloc(); - if (NULL == block) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&block_ctx); - return; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.bio; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } else { - block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "New written %c-block @%llu (%s/%llu/%d)\n", - is_metadata ? 'M' : 'D', - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); - if (ret) - printk(KERN_INFO - "btrfsic: process_metablock(root @%llu)" - " failed!\n", - (unsigned long long)dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } -} - -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; - int iodone_w_error; - - /* mutex is not held! 
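(presumably because end_io callbacks can run in interrupt context, where btrfsic_mutex cannot be taken)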
This is not safe if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bio_error_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_bh_private; - bp->bi_end_io = block->orig_bio_bh_end_io.bio; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bio_error_status, - btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bio_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long) - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp, bio_error_status); -} - -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; - int iodone_w_error = !uptodate; - struct btrfsic_dev_state *dev_state; - - BUG_ON(NULL == block); - dev_state = block->dev_state; - if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", - iodone_w_error, - btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long)dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is on disk */ - - bh->b_private = block->orig_bio_bh_private; - bh->b_end_io = block->orig_bio_bh_end_io.bh; - block->is_iodone = 1; /* for FLUSH, this releases the block */ - bh->b_end_io(bh, uptodate); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO - "btrfsic: superblock @%llu (%s/%llu/%d)" - " with old gen %llu <= %llu\n", - (unsigned long long)superblock->logical_bytenr, - superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) - btrfs_super_generation(super_hdr), - (unsigned long long) - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO - "btrfsic: got new superblock @%llu (%s/%llu/%d)" - " with new gen
%llu > %llu\n", - (unsigned long long)superblock->logical_bytenr, - superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) - btrfs_super_generation(super_hdr), - (unsigned long long) - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - - switch (pass) { - case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); - break; - case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); - break; - case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic_process_written_superblock(" - "mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu," - " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - printk(KERN_INFO - "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { - WARN_ON(1); - btrfsic_dump_tree(state); - } - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - struct list_head *elem_ref_to; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that 
this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations; it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until that block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage information. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. - */ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "rl=%d, %c @%llu (%s/%llu/%d)" - " %u* refers to %c @%llu (%s/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " with generation %llu !=" - " parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)l->block_ref_to->generation, - (unsigned long long)l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is not flushed out of disk's write cache" - " (block flush_gen=%llu," - " dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, -
(unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)block->flush_gen, - (unsigned long long) - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - struct list_head *elem_ref_from; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "rl=%d, %c @%llu (%s/%llu/%d)" - " is ref %u* from %c @%llu (%s/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long) - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - printk(KERN_INFO - "Add %u* link from %c @%llu (%s/%llu/%d)" - " to %c @%llu (%s/%llu/%d).\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - printk(KERN_INFO - "Rem %u* link from %c @%llu (%s/%llu/%d)" - " to %c @%llu (%s/%llu/%d).\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && 
- state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - struct list_head *elem_ref_to; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - printk(KERN_INFO - "btrfsic: error, kmalloc" " failed!\n"); - return NULL; - } - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, 
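/* editor's note: the flags below only seed a block that is created on first lookup; for a block already present in the hash table they are ignored */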
- int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (NULL == block) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return NULL; - } - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); - if (NULL == dev_state) { - printk(KERN_INFO - "btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "New %s%c-block @%llu (%s/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data) -{ - int num_copies; - int mirror_num; - int ret; - struct btrfsic_block_data_ctx block_ctx; - int match = 0; - - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - bytenr, PAGE_SIZE); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, - &block_ctx, mirror_num); - if (ret) { - printk(KERN_INFO "btrfsic:" - " btrfsic_map_block(logical @%llu," - " mirror %d) failed!\n", - (unsigned long long)bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (!match) { - printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," - " buffer->log_bytenr=%llu, submit_bio(bdev=%s," - " phys_bytenr=%llu)!\n", - (unsigned long long)bytenr, dev_state->name, - (unsigned long long)dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, - &block_ctx, mirror_num); - if (ret) - continue; - - printk(KERN_INFO "Read logical bytenr @%llu maps to" - " (%s/%llu/%d)\n", - (unsigned long long)bytenr, - block_ctx.dev->name, - (unsigned long long)block_ctx.dev_bytenr, - mirror_num); - } - WARN_ON(1); - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev) -{ - struct btrfsic_dev_state *ds; - - ds = btrfsic_dev_state_hashtable_lookup(bdev, - &btrfsic_dev_state_hashtable); - return ds; -} - -int btrfsic_submit_bh(int rw, struct buffer_head *bh) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return submit_bh(rw, bh); - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bh() might also be called before - * btrfsic_mount(), this might return NULL */ 
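/*
 * Editor's sketch, not part of the original patch: the interception
 * pattern that btrfsic_submit_bh() below (and btrfsic_submit_bio()
 * further down) implements, reduced to its essentials:
 *
 *	int checked_submit_bh(int rw, struct buffer_head *bh)
 *	{
 *		if (checker_enabled && (rw & WRITE) && bh->b_size > 0)
 *			check_and_hook_write(bh);	// hypothetical helper:
 *							// validate the payload,
 *							// then redirect b_end_io
 *		return submit_bh(rw, bh);		// always forward
 *	}
 *
 * checker_enabled and check_and_hook_write() are hypothetical stand-ins
 * for btrfsic_is_initialized and btrfsic_process_written_block(); the
 * real code additionally handles pure FLUSH requests by redirecting
 * b_end_io to btrfsic_bh_end_io so completion can bump last_flush_gen.
 */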
- dev_state = btrfsic_dev_state_lookup(bh->b_bdev); - - /* Only called to write the superblock (incl. FLUSH/FUA) */ - if (NULL != dev_state && - (rw & WRITE) && bh->b_size > 0) { - u64 dev_bytenr; - - dev_bytenr = 4096 * bh->b_blocknr; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," - " size=%lu, data=%p, bdev=%p)\n", - rw, bh->b_blocknr, - (unsigned long long)dev_bytenr, bh->b_size, - bh->b_data, bh->b_bdev); - btrfsic_process_written_block(dev_state, dev_bytenr, - bh->b_data, bh->b_size, NULL, - NULL, bh, rw); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", - rw, bh->b_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "btrfsic_submit_bh(%s) with FLUSH" - " but dummy block already in use" - " (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - return submit_bh(rw, bh); -} - -void btrfsic_submit_bio(int rw, struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) { - submit_bio(rw, bio); - return; - } - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); - if (NULL != dev_state && - (rw & WRITE) && NULL != bio->bi_io_vec) { - unsigned int i; - u64 dev_bytenr; - int bio_is_patched; - - dev_bytenr = 512 * bio->bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bio(rw=0x%x, bi_vcnt=%u," - " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, bio->bi_sector, - (unsigned long long)dev_bytenr, - bio->bi_bdev); - - for (i = 0; i < bio->bi_vcnt; i++) { - u8 *mapped_data; - - mapped_data = kmap(bio->bi_io_vec[i].bv_page); - if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE) == - (dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "#%u: page=%p, mapped=%p, len=%u," - " offset=%u\n", - i, bio->bi_io_vec[i].bv_page, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio->bi_io_vec[i].bv_offset); - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio, &bio_is_patched, - NULL, rw); - kunmap(bio->bi_io_vec[i].bv_page); - dev_bytenr += bio->bi_io_vec[i].bv_len; - } - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", - rw, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - 
"btrfsic_submit_bio(%s) with FLUSH" - " but dummy block already in use" - " (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - - submit_bio(rw, bio); -} - -int btrfsic_mount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - state = kzalloc(sizeof(*state), GFP_NOFS); - if (NULL == state) { - printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); - return -1; - } - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->root = root; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->csum_size = 0; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - char *p; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - printk(KERN_INFO - "btrfs check-integrity: kmalloc() failed!\n"); - mutex_unlock(&btrfsic_mutex); - return -1; - } - ds->bdev = device->bdev; - ds->state = state; - bdevname(ds->bdev, ds->name); - ds->name[BDEVNAME_SIZE - 1] = '\0'; - for (p = ds->name; *p != '\0'; p++); - while (p > ds->name && *p != '/') - p--; - if (*p == '/') - p++; - strlcpy(ds->name, p, sizeof(ds->name)); - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(root, fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices) -{ - struct list_head *elem_all; - struct list_head *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - printk(KERN_INFO - "btrfsic: error, cannot find state information" - " on umount!\n"); - mutex_unlock(&btrfsic_mutex); - return; - } - - /* 
- * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone) - btrfsic_block_free(b_all); - else - printk(KERN_INFO "btrfs: attempt to free %c-block" - " @%llu (%s/%llu/%d) on umount which is" - " not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kfree(state); -} diff --git a/trunk/fs/btrfs/check-integrity.h b/trunk/fs/btrfs/check-integrity.h deleted file mode 100644 index 8b59175cc502..000000000000 --- a/trunk/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#if !defined(__BTRFS_CHECK_INTEGRITY__) -#define __BTRFS_CHECK_INTEGRITY__ - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -#else -#define btrfsic_submit_bh submit_bh -#define btrfsic_submit_bio submit_bio -#endif - -int btrfsic_mount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index 0639a555e16e..dede441bdeee 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0, 1); + buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); + ret = btrfs_inc_ref(trans, root, buf, 1); BUG_ON(ret); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); + ret = btrfs_dec_ref(trans, root, buf, 0); BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); BUG_ON(ret); } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); } if (new_flags != 0) { @@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1, 1); + ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); } clean_tree_block(trans, root, buf); @@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size, 1); + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -500,7 +500,7 @@ static noinline int 
__btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer(mid); return 0; @@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1, 0); + btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer(right); right = NULL; } else { @@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer(mid); mid = NULL; } else { @@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0, 0); + level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2970,7 +2970,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + btrfs_free_tree_block(trans, root, leaf, 0, 1); return 0; } /* diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index 27ebe61d3ccc..67385033323d 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -86,9 +86,6 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - /* orphan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -695,54 +692,6 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* usage filter */ - __le64 usage; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - __le64 unused[8]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { -
/* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -802,32 +751,14 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 struct btrfs_block_group_item { __le64 used; @@ -985,7 +916,6 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; -struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -1041,7 +971,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:21; + unsigned long mount_opt:20; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1202,23 +1132,12 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_running; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -1236,10 +1155,6 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* filesystem state */ u64 fs_state; @@ -1468,8 +1383,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 -#define 
BTRFS_BALANCE_ITEM_KEY 248 - /* * string items are for debugging. They just store a short string of * data in the FS @@ -1500,9 +1413,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) -#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) -#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2167,86 +2077,8 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); -} - /* struct btrfs_super_block */ + BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, @@ -2364,7 +2196,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct 
extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } @@ -2445,11 +2277,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow); + u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow); + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2469,17 +2301,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -2491,7 +2323,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); + u64 root_objectid, u64 owner, u64 offset); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2650,18 +2482,10 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_leaf(root, p); - return 0; -} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc); + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2676,7 +2500,6 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { - kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); @@ -2687,24 +2510,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } -/** - * profile_is_valid - tests whether 
a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile - */ -static inline int profile_is_valid(u64 flags, int extended) -{ - u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; - - flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - if (extended) - mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & mask) - return 0; - /* true if zero or exactly one bit set */ - return (flags & (~flags + 1)) == flags; -} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/trunk/fs/btrfs/delayed-inode.c b/trunk/fs/btrfs/delayed-inode.c index fe4cd0f1cef1..9c1eccc2c503 100644 --- a/trunk/fs/btrfs/delayed-inode.c +++ b/trunk/fs/btrfs/delayed-inode.c @@ -595,12 +595,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_item", - item->key.objectid, - num_bytes, 1); + if (!ret) item->bytes_reserved = num_bytes; - } return ret; } @@ -614,9 +610,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_item", - item->key.objectid, item->bytes_reserved, - 0); btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -631,7 +624,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -658,13 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (ret == -EAGAIN) ret = -ENOSPC; - if (!ret) { + if (!ret) node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(root->fs_info, - "delayed_inode", - btrfs_ino(inode), - num_bytes, 1); - } return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); @@ -719,17 +707,11 @@ static int btrfs_delayed_inode_reserve_metadata( * reservation here. I think it may be time for a documentation page on * how block rsvs. work. 
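The profile_is_valid() helper deleted just above leans on a classic bit trick: ~x + 1 is -x in two's complement, x & -x isolates the lowest set bit, and only zero or a power of two compares equal to its own lowest bit. A minimal user-space sketch of that final test, with illustrative values standing in for the BTRFS_BLOCK_GROUP_* bits:

#include <stdint.h>
#include <stdio.h>

/* True iff 'flags' has zero or exactly one bit set: x & (~x + 1)
 * isolates the lowest set bit, which equals x only in those cases. */
static int at_most_one_bit(uint64_t flags)
{
	return (flags & (~flags + 1)) == flags;
}

int main(void)
{
	printf("%d\n", at_most_one_bit(0));                         /* 1: SINGLE */
	printf("%d\n", at_most_one_bit(1ULL << 3));                 /* 1: RAID0  */
	printf("%d\n", at_most_one_bit((1ULL << 3) | (1ULL << 4))); /* 0: mixed  */
	return 0;
}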
*/ - if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", - btrfs_ino(inode), num_bytes, 1); + if (!ret) node->bytes_reserved = num_bytes; - } - if (release) { - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); + if (release) btrfs_block_rsv_release(root, src_rsv, num_bytes); - } return ret; } @@ -743,8 +725,6 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", - node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1392,6 +1372,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, goto release_node; } + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + delayed_item->key.objectid = btrfs_ino(dir); btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); delayed_item->key.offset = index; @@ -1404,14 +1391,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item->type = type; memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - - mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { diff --git a/trunk/fs/btrfs/delayed-ref.c b/trunk/fs/btrfs/delayed-ref.c index 66e4f29505a3..125cf76fcd08 100644 --- a/trunk/fs/btrfs/delayed-ref.c +++ b/trunk/fs/btrfs/delayed-ref.c @@ -101,11 +101,6 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; - /* merging of sequenced refs is not allowed */ - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -155,22 +150,16 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot. - * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. 
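The comp_entry() hunk above drops 'seq' from the comparison chain. The pattern itself is worth spelling out: compare the most significant key first and fall through to the next key only on a tie, so the rbtree ordering stays total. A sketch with illustrative field names, not the kernel structs:

#include <stdint.h>

struct ref {
	uint64_t bytenr;
	int type;
	uint64_t seq;	/* the field this patch removes */
};

static int comp_ref(const struct ref *a, const struct ref *b)
{
	if (a->bytenr != b->bytenr)
		return a->bytenr < b->bytenr ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->seq != b->seq)
		return a->seq < b->seq ? -1 : 1;
	return 0;	/* equal on every key */
}

With the seq step gone, two refs that tie on the earlier keys compare equal again and become candidates for merging, which is exactly what the deleted "merging of sequenced refs is not allowed" comment was guarding against.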
+ * head if it was able to find one, or NULL if nothing was in that spot */ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) + struct btrfs_delayed_ref_node **last) { - struct rb_node *n; + struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; - int cmp = 0; + int cmp; -again: - n = root->rb_node; - entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); @@ -193,19 +182,6 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, else return entry; } - if (entry && return_bigger) { - if (cmp > 0) { - n = rb_next(&entry->rb_node); - if (!n) - n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; - return_bigger = 0; - goto again; - } - return entry; - } return NULL; } @@ -233,24 +209,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq) -{ - struct seq_list *elem; - - assert_spin_locked(&delayed_refs->lock); - if (list_empty(&delayed_refs->seq_head)) - return 0; - - elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); - if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", - seq, elem->seq, delayed_refs); - return 1; - } - return 0; -} - int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 start) { @@ -265,8 +223,20 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); + find_ref_head(&delayed_refs->root, start, &ref); if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } node = &ref->rb_node; } else node = rb_first(&delayed_refs->root); @@ -420,8 +390,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, int action, int is_data) @@ -468,7 +437,6 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; - ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; @@ -500,17 +468,14 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, /* * helper to insert a delayed tree ref into the rbtree. 
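The removed 'return_bigger' mode of find_ref_head() returns the next bigger entry on a miss and wraps around to the smallest one, which is how the caller resumed scanning from an arbitrary bytenr. A user-space model over a sorted array standing in for the rbtree, assuming unique keys:

#include <stddef.h>
#include <stdint.h>

/* Return the first entry >= key, wrapping to v[0] past the end;
 * NULL only when the set is empty. */
static const uint64_t *find_ge_wrap(const uint64_t *v, size_t n,
				    uint64_t key)
{
	size_t lo = 0, hi = n;

	if (n == 0)
		return NULL;
	while (lo < hi) {		/* classic lower_bound */
		size_t mid = lo + (hi - lo) / 2;
		if (v[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo < n ? &v[lo] : &v[0];
}

The restored btrfs_find_ref_cluster() code gets the same "first head at or after start" effect differently: it looks up 'start' and then walks rb_prev() until the bytenr would drop below 'start'.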
*/ -static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) + u64 ref_root, int level, int action) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -526,17 +491,14 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) - seq = inc_delayed_seq(delayed_refs); - ref->seq = seq; - full_ref = btrfs_delayed_node_to_tree_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) + if (parent) { + full_ref->parent = parent; ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - else + } else { + full_ref->root = ref_root; ref->type = BTRFS_TREE_BLOCK_REF_KEY; + } full_ref->level = level; trace_btrfs_delayed_tree_ref(ref, full_ref, action); @@ -560,17 +522,15 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) + int action) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -586,18 +546,14 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) - seq = inc_delayed_seq(delayed_refs); - ref->seq = seq; - full_ref = btrfs_delayed_node_to_data_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) + if (parent) { + full_ref->parent = parent; ref->type = BTRFS_SHARED_DATA_REF_KEY; - else + } else { + full_ref->root = ref_root; ref->type = BTRFS_EXTENT_DATA_REF_KEY; - + } full_ref->objectid = owner; full_ref->offset = offset; @@ -624,12 +580,10 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, * to make sure the delayed ref is eventually processed before this * transaction commits. 
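add_delayed_tree_ref() above goes back to setting exactly one of 'parent' or 'root', with the ref type recording which one is live; the matching union overlay reappears in the delayed-ref.h hunk further down. A sketch of that convention, with illustrative names:

#include <stdint.h>

enum ref_type { TREE_BLOCK_REF, SHARED_BLOCK_REF };

struct tree_ref {
	enum ref_type type;	/* records which union member is live */
	union {
		uint64_t root;		/* TREE_BLOCK_REF   */
		uint64_t parent;	/* SHARED_BLOCK_REF */
	};
};

static void init_tree_ref(struct tree_ref *ref,
			  uint64_t parent, uint64_t ref_root)
{
	if (parent) {
		ref->type = SHARED_BLOCK_REF;
		ref->parent = parent;
	} else {
		ref->type = TREE_BLOCK_REF;
		ref->root = ref_root;
	}
}

The for_cow variant being reverted stored both fields unconditionally, which is why it could not use the union.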
*/ -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -656,17 +610,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); BUG_ON(ret); - ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - for_cow); + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); BUG_ON(ret); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -674,13 +624,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -707,23 +655,18 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); BUG_ON(ret); - ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, - num_bytes, parent, ref_root, owner, offset, - action, for_cow); + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); BUG_ON(ret); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -740,13 +683,11 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); BUG_ON(ret); - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -763,7 +704,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, 
NULL, 0); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/trunk/fs/btrfs/delayed-ref.h b/trunk/fs/btrfs/delayed-ref.h index d8f244d94925..e287e3b0eab0 100644 --- a/trunk/fs/btrfs/delayed-ref.h +++ b/trunk/fs/btrfs/delayed-ref.h @@ -33,9 +33,6 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; - /* seq number to keep track of insertion order */ - u64 seq; - /* ref count on this data structure */ atomic_t refs; @@ -101,15 +98,19 @@ struct btrfs_delayed_ref_head { struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; + union { + u64 root; + u64 parent; + }; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; + union { + u64 root; + u64 parent; + }; u64 objectid; u64 offset; }; @@ -139,26 +140,6 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; - - /* - * seq number of delayed refs. We need to know if a backref was being - * added before the currently processed ref or afterwards. - */ - u64 seq; - - /* - * seq_list holds a list of all seq numbers that are currently being - * added to the list. While walking backrefs (btrfs_find_all_roots, - * qgroups), which might take some time, no newer ref must be processed, - * as it might influence the outcome of the walk. - */ - struct list_head seq_head; - - /* - * when the only refs we have in the list must not be processed, we want - * to wait for more refs to show up or for the end of backref walking. - */ - wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -170,21 +151,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow); -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow); -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); @@ -194,60 +170,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); - -struct seq_list { - struct list_head list; - u64 seq; -}; - -static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) -{ - assert_spin_locked(&delayed_refs->lock); - ++delayed_refs->seq; - return delayed_refs->seq; -} - -static inline void -btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - assert_spin_locked(&delayed_refs->lock); - elem->seq = delayed_refs->seq; - list_add_tail(&elem->list, &delayed_refs->seq_head); -} - -static inline void -btrfs_put_delayed_seq(struct btrfs_delayed_ref_root 
*delayed_refs, - struct seq_list *elem) -{ - spin_lock(&delayed_refs->lock); - list_del(&elem->list); - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); -} - -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq); - -/* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c index 7aa9cd36bf1b..d8525662ca7a 100644 --- a/trunk/fs/btrfs/disk-io.c +++ b/trunk/fs/btrfs/disk-io.c @@ -43,7 +43,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" -#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -1144,6 +1143,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_item_inserted = 0; root->orphan_cleanup_state = 0; + root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; @@ -1217,14 +1217,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); - if (root) - root->fs_info = fs_info; - return root; -} - static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1232,7 +1224,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = btrfs_alloc_root(fs_info); + root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); @@ -1252,8 +1244,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1327,7 +1318,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u32 blocksize; int ret = 0; - root = btrfs_alloc_root(fs_info); + root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); if (location->offset == (u64)-1) { @@ -1883,9 +1874,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } -int open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -1897,8 +1888,8 @@ int open_ctree(struct super_block *sb, struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; @@ -1909,14 +1900,16 @@ 
int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); - csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); - dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } @@ -2005,17 +1998,6 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2285,7 +2267,9 @@ int open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); @@ -2334,6 +2318,9 @@ int open_ctree(struct super_block *sb, fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { @@ -2366,19 +2353,6 @@ int open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { - ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 
- 1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" - " integrity check module %s\n", sb->s_id); - } -#endif - /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2394,7 +2368,7 @@ int open_ctree(struct super_block *sb, btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = btrfs_alloc_root(fs_info); + log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); if (!log_tree_root) { err = -ENOMEM; goto fail_trans_kthread; @@ -2449,17 +2423,13 @@ int open_ctree(struct super_block *sb, if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); - - if (!err) - err = btrfs_recover_balance(fs_info->tree_root); - if (err) { close_ctree(tree_root); - return err; + return ERR_PTR(err); } } - return 0; + return tree_root; fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); @@ -2505,7 +2475,8 @@ int open_ctree(struct super_block *sb, cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); - return err; + free_fs_info(fs_info); + return ERR_PTR(err); recovery_tree_root: if (!btrfs_test_opt(tree_root, RECOVERY)) @@ -2660,7 +2631,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2737,7 +2708,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + submit_bio(WRITE_FLUSH, bio); return 0; } @@ -3001,9 +2972,6 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); - /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); - btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ @@ -3011,7 +2979,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); + btrfs_run_defrag_inodes(root->fs_info); /* * Here come 2 situations when btrfs is broken to flip readonly: @@ -3040,8 +3008,8 @@ int close_ctree(struct btrfs_root *root) btrfs_put_block_group_cache(fs_info); - kthread_stop(fs_info->transaction_kthread); - kthread_stop(fs_info->cleaner_kthread); + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); fs_info->closing = 2; smp_mb(); @@ -3059,14 +3027,14 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->extent_root->commit_root); free_extent_buffer(fs_info->tree_root->node); free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(fs_info->chunk_root->node); - free_extent_buffer(fs_info->chunk_root->commit_root); - free_extent_buffer(fs_info->dev_root->node); - free_extent_buffer(fs_info->dev_root->commit_root); - free_extent_buffer(fs_info->csum_root->node); - free_extent_buffer(fs_info->csum_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); - btrfs_free_block_groups(fs_info); + 
btrfs_free_block_groups(root->fs_info); del_fs_roots(fs_info); @@ -3086,17 +3054,14 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) - btrfsic_unmount(root, fs_info->fs_devices); -#endif - btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); + free_fs_info(fs_info); + return 0; } diff --git a/trunk/fs/btrfs/disk-io.h b/trunk/fs/btrfs/disk-io.h index e4bc4741319b..c99d0a8f13fa 100644 --- a/trunk/fs/btrfs/disk-io.h +++ b/trunk/fs/btrfs/disk-io.h @@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -int open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); diff --git a/trunk/fs/btrfs/export.c b/trunk/fs/btrfs/export.c index 5f77166fd01c..1b8dc33778f9 100644 --- a/trunk/fs/btrfs/export.c +++ b/trunk/fs/btrfs/export.c @@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; struct inode *inode; struct btrfs_key key; diff --git a/trunk/fs/btrfs/extent-tree.c b/trunk/fs/btrfs/extent-tree.c index 700879ed64cf..f5fbe576d2ba 100644 --- a/trunk/fs/btrfs/extent-tree.c +++ b/trunk/fs/btrfs/extent-tree.c @@ -618,7 +618,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -1871,24 +1872,20 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL); } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -2235,28 +2232,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } } - /* - * locked_ref is the head node, so we have to go one - * node back for any 
delayed ref updates - */ - ref = select_delayed_ref(locked_ref); - - if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - locked_ref = NULL; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; - } - /* * record the must insert reserved flag before we * drop the spin lock. @@ -2267,6 +2242,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, @@ -2287,7 +2267,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - goto next; + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; } list_del_init(&locked_ref->cluster); @@ -2297,12 +2279,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - /* - * we modified num_entries, but as we're currently running - * delayed refs, skip - * wake_up(&delayed_refs->seq_wait); - * here. - */ + spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2312,34 +2289,13 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; -next: - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); + cond_resched(); spin_lock(&delayed_refs->lock); } return count; } - -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs) -{ - struct list_head *first_seq = delayed_refs->seq_head.next; - - spin_unlock(&delayed_refs->lock); - pr_debug("waiting for more refs (num %ld, first %p)\n", - num_refs, first_seq); - wait_event(delayed_refs->seq_wait, - num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); - pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); - spin_lock(&delayed_refs->lock); -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. 
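wait_for_more_refs(), deleted above, is a textbook sleep-until-state-changes loop: record what you saw under the lock, then block until either the entry count or the head of the seq list differs. A rough user-space equivalent, with pthread primitives standing in for the kernel waitqueue and illustrative names:

#include <pthread.h>

struct ref_queue {
	pthread_mutex_t lock;
	pthread_cond_t changed;		/* broadcast on queue updates */
	unsigned long num_entries;
	void *first_seq;
};

static void wait_for_more_refs(struct ref_queue *q,
			       unsigned long seen_entries,
			       void *seen_first)
{
	pthread_mutex_lock(&q->lock);
	while (q->num_entries == seen_entries && q->first_seq == seen_first)
		pthread_cond_wait(&q->changed, &q->lock);
	pthread_mutex_unlock(&q->lock);
}

The kernel version likewise dropped delayed_refs->lock before sleeping and retook it afterwards, mirroring the unlock that pthread_cond_wait() performs internally here.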
count can be @@ -2355,23 +2311,15 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct list_head cluster; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - unsigned long num_refs = 0; - int consider_waiting; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: - consider_waiting = 0; spin_lock(&delayed_refs->lock); if (count == 0) { count = delayed_refs->num_entries * 2; @@ -2388,35 +2336,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * of refs to process starting at the first one we are able to * lock */ - delayed_start = delayed_refs->run_delayed_start; ret = btrfs_find_ref_cluster(trans, &cluster, delayed_refs->run_delayed_start); if (ret) break; - if (delayed_start >= delayed_refs->run_delayed_start) { - if (consider_waiting == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked) and if the number of - * refs doesn't change, we avoid busy waiting. - */ - consider_waiting = 1; - num_refs = delayed_refs->num_entries; - } else { - wait_for_more_refs(delayed_refs, num_refs); - /* - * after waiting, things have changed. we - * dropped the lock and someone else might have - * run some refs, built new clusters and so on. - * therefore, we restart staleness detection. - */ - consider_waiting = 0; - } - } - ret = run_clustered_refs(trans, root, &cluster); BUG_ON(ret < 0); @@ -2424,11 +2348,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (count == 0) break; - - if (ret || delayed_refs->run_delayed_start == 0) { - /* refs were run, let's reset staleness detection */ - consider_waiting = 0; - } } if (run_all) { @@ -2486,8 +2405,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_key = 0; extent_op->is_data = is_data ? 
1 : 0; - ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, - num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); if (ret) kfree(extent_op); return ret; @@ -2672,7 +2590,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc) { u64 bytenr; u64 num_bytes; @@ -2685,7 +2603,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); @@ -2722,15 +2640,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - for_cow); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -2742,15 +2659,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3076,7 +2993,9 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3097,27 +3016,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= 
extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } } -/* - * @flags: available profiles in extended format (see ctree.h) - * - * Returns reduced profile in chunk format. If profile changing is in - * progress (either running or paused) picks the target profile (if it's - * already available), otherwise falls back to plain reducing. - */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* @@ -3128,34 +3040,6 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; - /* pick restriper's target profile if it's available */ - spin_lock(&root->fs_info->balance_lock); - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && - (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->data.target)) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && - (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->sys.target)) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && - (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->meta.target)) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { - spin_unlock(&root->fs_info->balance_lock); - flags = tgt; - goto out; - } - } - spin_unlock(&root->fs_info->balance_lock); - if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3175,25 +3059,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { + (flags & BTRFS_BLOCK_GROUP_DUP))) flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } - -out: - /* extended -> chunk profile */ - flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; - + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; return btrfs_reduce_alloc_profile(root, flags); } @@ -3310,8 +3191,6 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3331,8 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3380,15 +3257,27 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } + + /* + * we have two similar checks here, 
one based on percentage + * and once based on a hard number of 256MB. The idea + * is that if we have a good amount of free + * room, don't allocate a chunk. A good mount is + * less than 80% utilized of the chunks we have allocated, + * or more than 256MB free + */ + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; + /* 256MB or 5% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) return 0; return 1; } @@ -3402,7 +3291,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - BUG_ON(!profile_is_valid(flags, 0)); + flags = btrfs_reduce_alloc_profile(extent_root, flags); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { @@ -3693,10 +3582,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, if (used <= space_info->total_bytes) { if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); ret = 0; } else { /* @@ -3764,10 +3649,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3874,8 +3755,7 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, +static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3911,9 +3791,6 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - (u64)space_info, - num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4070,8 +3947,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, if (global_rsv->full || global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, - num_bytes); + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); } /* @@ -4130,15 +4006,11 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 0); sinfo->reservation_progress++; 
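The restored should_alloc_chunk() logic above combines a hard byte floor with a percentage check: keep using existing chunks while they still have either 256MB free or at least 20% headroom. Condensed into a standalone sketch (div_factor(x, 8) in btrfs computes x * 8 / 10, i.e. an 80% utilization bound):

#include <stdint.h>

#define SZ_256M		(256ULL * 1024 * 1024)

static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;
}

static int should_alloc_chunk(uint64_t total_bytes, uint64_t allocated,
			      uint64_t alloc_bytes)
{
	/* more than 256MB would still be free: don't allocate */
	if (allocated + alloc_bytes + SZ_256M < total_bytes)
		return 0;
	/* still under 80% utilized: don't allocate */
	if (allocated + alloc_bytes < div_factor(total_bytes, 8))
		return 0;
	return 1;
}

The full function goes on to compare against max(256MB, 5% of the filesystem); the sketch keeps only the two per-space-info checks that the restored comment describes.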
block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4173,8 +4045,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1); + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); @@ -4191,8 +4062,6 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, - trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4210,8 +4079,6 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, * when we are truly done with the orphan item. */ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); - trace_btrfs_space_reservation(root->fs_info, "orphan", - btrfs_ino(inode), num_bytes, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4219,8 +4086,6 @@ void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); - trace_btrfs_space_reservation(root->fs_info, "orphan", - btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4348,11 +4213,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(root, inode)) flush = 0; + else + WARN_ON(!mutex_is_locked(&inode->i_mutex)); if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4400,14 +4266,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (to_free) { + if (to_free) btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4418,11 +4278,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", - btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; @@ -4452,8 +4308,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4708,10 +4562,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)space_info, - num_bytes, 0); + BUG_ON(space_info->bytes_may_use < num_bytes); space_info->bytes_may_use -= 
num_bytes; } } @@ -5077,8 +4928,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the @@ -5106,17 +4955,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) + u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); } @@ -5151,12 +4999,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_put_block_group(cache); } -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; /* * tree log blocks never actually go into the extent allocation @@ -5168,17 +5016,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); } return ret; @@ -5301,8 +5146,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(orig_root, num_bytes, empty_size, data); - space_info = __find_space_info(root->fs_info, data); if (!space_info) { printk(KERN_ERR "No space info for %llu\n", data); @@ -5452,6 +5295,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (unlikely(block_group->ro)) goto loop; + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5479,8 +5331,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, 
num_bytes); goto checks; } @@ -5499,15 +5349,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. */ - if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5518,11 +5361,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5539,9 +5377,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, - num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5566,15 +5401,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, } unclustered_alloc: - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5612,6 +5438,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, goto loop; } + ins->objectid = search_start; + ins->offset = num_bytes; + if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -5628,8 +5457,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(orig_root, block_group, - search_start, num_bytes); if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -6015,10 +5842,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); return ret; } @@ -6171,11 +5997,10 @@ use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOSPC); } -static void unuse_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); } /* @@ -6189,7 +6014,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 
empty_size, int for_cow) + u64 hint, u64 empty_size) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6205,7 +6030,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, empty_size, hint, (u64)-1, &ins, 0); if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); + unuse_block_rsv(block_rsv, blocksize); return ERR_PTR(ret); } @@ -6233,11 +6058,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->is_data = 0; - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); + extent_op); BUG_ON(ret); } return buf; @@ -6254,7 +6078,6 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; - int for_reloc; }; #define DROP_REFERENCE 1 @@ -6393,9 +6216,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); @@ -6539,7 +6362,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); } btrfs_tree_unlock(next); @@ -6613,11 +6436,9 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 1); else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); } /* make block locked assertion in clean_tree_block happy */ @@ -6644,7 +6465,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6728,8 +6549,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * blocks are properly updated. */ void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6817,7 +6637,6 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; - wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6902,7 +6721,6 @@ void btrfs_drop_snapshot(struct btrfs_root *root, * drop subtree rooted at tree block 'node'. 
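The walk_control changes strip the for_reloc plumbing from the snapshot-drop walker, but the walk keeps its shape: stop descending while a subtree is still shared, and free on the way back up once the last reference is held. A loose toy model of that shape only — the real walker is iterative, staged (DROP_REFERENCE/UPDATE_BACKREF) and transactional, and struct node/drop_tree here are inventions:

#include <stdlib.h>

struct node {
	int refs;		/* how many trees still point here */
	int nchild;
	struct node *child[8];
};

/* Post-order drop: if another snapshot still references this node we
 * only drop our ref and stop -- everything below stays shared.  If we
 * held the last reference, the children are ours to walk and free. */
static void drop_tree(struct node *n)
{
	if (!n)
		return;
	if (--n->refs > 0)
		return;
	for (int i = 0; i < n->nchild; i++)
		drop_tree(n->child[i]);
	free(n);
}

int main(void)
{
	struct node *leaf = calloc(1, sizeof(*leaf));
	struct node *root = calloc(1, sizeof(*root));

	leaf->refs = 2;		/* shared with another snapshot */
	root->refs = 1;
	root->nchild = 1;
	root->child[0] = leaf;
	drop_tree(root);	/* frees root; leaf survives, refs == 1 */
	free(leaf);
	return 0;
}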
* * NOTE: this function will unlock and release tree block 'node' - * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -6947,7 +6765,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; - wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6975,29 +6792,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - /* pick restriper's target profile and return */ - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { - /* extended -> chunk profile */ - tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return tgt; - } - } - /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS @@ -7291,7 +7085,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(device, min_free, + ret = find_free_dev_extent(NULL, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7653,7 +7447,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); - update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7673,22 +7466,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return 0; } -static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits &= ~extra_flags; -} - int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { @@ -7699,7 +7476,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_key key; struct inode *inode; int ret; - int index; int factor; root = root->fs_info->extent_root; @@ -7715,7 +7491,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, block_group); memcpy(&key, &block_group->key, sizeof(key)); - index = get_block_group_index(block_group); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -7790,8 +7565,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if 
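With the restriper branch deleted, update_block_group_flags is back to reducing the target profile purely from the usable device count (including missing devices, per the surrounding comment). The reduction rule, sketched with made-up flag bits — the real mask values and the full ordering live in ctree.h:

#include <stdint.h>
#include <stdio.h>

#define BG_RAID0	(1u << 0)	/* illustrative bits only */
#define BG_RAID1	(1u << 1)
#define BG_RAID10	(1u << 2)

static uint32_t reduce_profile(uint32_t flags, unsigned num_devices)
{
	if (num_devices == 1)
		flags &= ~(BG_RAID1 | BG_RAID0);	/* no mirror/stripe */
	if (num_devices < 4)
		flags &= ~BG_RAID10;			/* needs 4 devices */

	/* pick the strongest profile that survived the device check */
	if (flags & BG_RAID10)
		return BG_RAID10;
	if (flags & BG_RAID1)
		return BG_RAID1;
	if (flags & BG_RAID0)
		return BG_RAID0;
	return 0;		/* single */
}

int main(void)
{
	/* RAID10 requested on a 2-device fs degrades to RAID1 */
	printf("%u\n", reduce_profile(BG_RAID10 | BG_RAID1, 2));
	return 0;
}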
(list_empty(&block_group->space_info->block_groups[index])) - clear_avail_alloc_bits(root->fs_info, block_group->flags); up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 9d09a4f81875..49f3c9dc09f4 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -18,7 +18,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" -#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1896,7 +1895,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); + submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2394,7 +2393,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -3580,7 +3579,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->blocking_writers, 0); atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->spinning_writers, 0); - eb->lock_nested = 0; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/trunk/fs/btrfs/extent_io.h b/trunk/fs/btrfs/extent_io.h index bc6a042cb6fc..7604c3001322 100644 --- a/trunk/fs/btrfs/extent_io.h +++ b/trunk/fs/btrfs/extent_io.h @@ -129,7 +129,6 @@ struct extent_buffer { struct list_head leak_list; struct rcu_head rcu_head; atomic_t refs; - pid_t lock_owner; /* count of read lock holders on the extent buffer */ atomic_t write_locks; @@ -138,7 +137,6 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; - int lock_nested; /* protects write locks */ rwlock_t lock; diff --git a/trunk/fs/btrfs/file.c b/trunk/fs/btrfs/file.c index 859ba2dd8890..034d98503229 100644 --- a/trunk/fs/btrfs/file.c +++ b/trunk/fs/btrfs/file.c @@ -678,7 +678,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset); BUG_ON(ret); *hint_byte = disk_bytenr; } @@ -753,7 +753,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); inode_sub_bytes(inode, extent_end - key.offset); @@ -962,7 +962,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); if (split == start) { @@ -989,7 +989,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); } other_start = 0; @@ -1006,7 +1006,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); } if 
(del_nr == 0) { @@ -1274,6 +1274,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); pos += copied; num_written += copied; diff --git a/trunk/fs/btrfs/free-space-cache.c b/trunk/fs/btrfs/free-space-cache.c index d20ff87ca603..9a897bf79538 100644 --- a/trunk/fs/btrfs/free-space-cache.c +++ b/trunk/fs/btrfs/free-space-cache.c @@ -319,11 +319,9 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - if (io_ctl->pages[i]) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); - } + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); } } @@ -637,10 +635,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return ret; - + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; @@ -843,7 +838,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, extent_start, extent_end, len; + u64 start, end, len; int entries = 0; int bitmaps = 0; int ret; @@ -854,9 +849,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -864,12 +857,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); + /* + * When searching for pinned extents, we need to start at our start + * offset. 
+ */ + if (block_group) + start = block_group->key.objectid; + node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -912,20 +918,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ - - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - - if (block_group) - start = block_group->key.objectid; - while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, + ret = find_first_extent_bit(unpin, start, &start, &end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -933,21 +928,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (extent_start >= block_group->key.objectid + + if (start >= block_group->key.objectid + block_group->key.offset) break; - extent_start = max(extent_start, start); - extent_end = min(block_group->key.objectid + - block_group->key.offset, extent_end + 1); - len = extent_end - extent_start; + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); if (ret) goto out_nospc; - start = extent_end; + start = end + 1; } /* Write out the bitmaps */ @@ -2289,23 +2283,23 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) + u64 offset, u64 bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long want_bits; - unsigned long min_bits; + unsigned long search_bits; + unsigned long total_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; + bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - want_bits = bytes_to_bits(bytes, block_group->sectorsize); - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + search_bits = bytes_to_bits(bytes, block_group->sectorsize); + total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2314,7 +2308,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= min_bits) { + if (next_zero - i >= search_bits) { found_bits = next_zero - i; break; } @@ -2324,9 +2318,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, if (!found_bits) return -ENOSPC; - if (!total_found) { + if (!found) { start = i; cluster->max_size = 0; + found = true; } total_found += found_bits; @@ -2334,8 +2329,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < want_bits || cluster->max_size < cont1_bytes) { - i = next_zero + 1; + if (total_found < total_bits) { + i = 
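The rewritten pinned-extent loop is interval clipping: each EXTENT_DIRTY range found in the pinned tree is trimmed against the block group before being emitted as a free-space entry. The clip step in isolation, with invented names and an inclusive end as in the extent bit tree:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };	/* end is inclusive */

/* Clip 'ext' against the block group 'bg', starting no earlier than
 * 'cursor'.  Returns the usable length, 0 if nothing overlaps. */
static uint64_t clip_len(struct range bg, struct range ext,
			 uint64_t cursor, uint64_t *out_start)
{
	uint64_t s = ext.start > cursor ? ext.start : cursor;
	uint64_t e = (ext.end < bg.end ? ext.end : bg.end) + 1;

	if (s > bg.end || e <= s)
		return 0;
	*out_start = s;
	return e - s;
}

int main(void)
{
	struct range bg  = { 0, 4095 };		/* 4 KiB block group */
	struct range ext = { 4000, 8191 };	/* pinned range overhangs */
	uint64_t start, len = clip_len(bg, ext, 0, &start);

	printf("entry at %llu, len %llu\n",	/* 4000, 96 */
	       (unsigned long long)start, (unsigned long long)len);
	return 0;
}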
find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } goto again; } @@ -2346,31 +2346,28 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, &entry->offset_index, 1); BUG_ON(ret); - trace_btrfs_setup_cluster(block_group, cluster, - total_found * block_group->sectorsize, 1); return 0; } /* * This searches the block group for just extents to fill the cluster with. - * Try to find a cluster with at least bytes total bytes, at least one - * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 total_size = 0; + u64 max_gap = 128 * 1024; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2380,8 +2377,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap || entry->bytes < min_bytes) { - if (entry->bitmap && list_empty(&entry->list)) + while (entry->bitmap) { + if (list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2394,9 +2391,12 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; + prev = entry; - for (node = rb_next(&entry->offset_index); node; - node = rb_next(&entry->offset_index)) { + while (window_free <= min_bytes) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,18 +2405,26 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - if (entry->bytes < min_bytes) - continue; - - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) + /* + * we haven't filled the empty size and the window is + * very large. 
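btrfs_bitmap_cluster, in the form restored here, scans a free-space bitmap for runs of set bits, accumulates qualifying runs until enough total bits are collected, and restarts when the window spreads too wide. A reduced version of that scan over a single 64-bit word — the names and the exact restart rule are approximations of the kernel loop, not copies:

#include <stdint.h>
#include <stdio.h>

#define NBITS 64

static int next_set(uint64_t map, int from)
{
	for (int i = from; i < NBITS; i++)
		if (map >> i & 1)
			return i;
	return NBITS;
}

static int next_zero(uint64_t map, int from)
{
	for (int i = from; i < NBITS; i++)
		if (!(map >> i & 1))
			return i;
	return NBITS;
}

/* Gather runs of at least run_min set bits until total_min bits are
 * collected; restart if the window grows past twice total_min. */
static int scan_cluster(uint64_t map, int run_min, int total_min,
			int *start_out)
{
	int total = 0, start = -1;
	int i = next_set(map, 0);

	while (i < NBITS) {
		int z = next_zero(map, i);

		if (z - i >= run_min) {
			if (start < 0)
				start = i;
			total += z - i;
			if (total >= total_min) {
				*start_out = start;
				return 0;
			}
		}
		i = next_set(map, z);
		if (start >= 0 && i - start > 2 * total_min) {
			total = 0;	/* too sparse: restart the window */
			start = -1;
		}
	}
	return -1;		/* ENOSPC */
}

int main(void)
{
	int start;

	/* two 4-bit runs close together: bits 4-7 and 12-15 */
	if (scan_cluster(0xF0F0u, 4, 8, &start) == 0)
		printf("cluster starts at bit %d\n", start);	/* 4 */
	return 0;
}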
reset and try again + */ + if (entry->offset - (prev->offset + prev->bytes) > max_gap || + entry->offset - window_start > (min_bytes * 2)) { + first = entry; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; max_extent = entry->bytes; + } else { + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + prev = entry; } - if (window_free < bytes || max_extent < cont1_bytes) - return -ENOSPC; - cluster->window_start = first->offset; node = &first->offset_index; @@ -2430,18 +2438,17 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap || entry->bytes < min_bytes) + if (entry->bitmap) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); - total_size += entry->bytes; BUG_ON(ret); } while (node && entry != last); cluster->max_size = max_extent; - trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); + return 0; } @@ -2453,7 +2460,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2478,7 +2485,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, cont1_bytes, min_bytes); + bytes, min_bytes); if (!ret) return 0; } @@ -2492,7 +2499,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes+empty_size. + * is to find at least bytes free and up to empty_size + bytes free. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2508,24 +2515,23 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; - u64 cont1_bytes; int ret; - /* - * Choose the minimum extent size we'll require for this - * cluster. For SSD_SPREAD, don't allow any fragmentation. - * For metadata, allow allocates with smaller extents. For - * data, keep it dense. - */ + /* for metadata, allow allocates with more holes */ if (btrfs_test_opt(root, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - cont1_bytes = bytes; - min_bytes = block_group->sectorsize; - } else { - cont1_bytes = max(bytes, (bytes + empty_size) >> 2); - min_bytes = block_group->sectorsize; - } + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. 
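setup_cluster_no_bitmap is likewise back to a sliding window over the sorted free extents: the window restarts whenever the gap to the previous extent exceeds max_gap or the window outgrows twice min_bytes. The same scan over a plain array (window_scan is an invented name; the kernel walks an rbtree instead):

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t offset, bytes; };

/* Return the index of the first extent in a window whose free bytes
 * exceed min_bytes, or -1 if no window qualifies. */
static int window_scan(const struct extent *e, int n,
		       uint64_t min_bytes, uint64_t max_gap)
{
	if (n == 0)
		return -1;

	int first = 0;
	uint64_t window_start = e[0].offset;
	uint64_t window_free = e[0].bytes;

	for (int i = 1; window_free <= min_bytes; i++) {
		if (i == n)
			return -1;	/* ran out of extents: ENOSPC */

		uint64_t gap = e[i].offset -
			       (e[i - 1].offset + e[i - 1].bytes);

		if (gap > max_gap ||
		    e[i].offset - window_start > 2 * min_bytes) {
			first = i;	/* too spread out: restart here */
			window_start = e[i].offset;
			window_free = e[i].bytes;
		} else {
			window_free += e[i].bytes;
		}
	}
	return first;
}

int main(void)
{
	struct extent e[] = {
		{ 0,      4096  },
		{ 524288, 65536 },	/* huge gap: window restarts here */
		{ 593920, 65536 },
	};

	printf("window starts at extent %d\n",
	       window_scan(e, 3, 100000, 128 * 1024));	/* -> 1 */
	return 0;
}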
+ */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); spin_lock(&ctl->tree_lock); @@ -2533,7 +2539,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < bytes) { + if (ctl->free_space < min_bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2546,17 +2552,11 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, - min_bytes); - - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes + empty_size, - cont1_bytes, min_bytes); + bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes + empty_size, - cont1_bytes, min_bytes); + offset, bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) @@ -2567,8 +2567,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; - } else { - trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); @@ -2590,57 +2588,17 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -static int do_trimming(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 bytes, - u64 reserved_start, u64 reserved_bytes) -{ - struct btrfs_space_info *space_info = block_group->space_info; - struct btrfs_fs_info *fs_info = block_group->fs_info; - int ret; - int update = 0; - u64 trimmed = 0; - - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += reserved_bytes; - space_info->bytes_reserved += reserved_bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, bytes, &trimmed); - if (!ret) - *total_trimmed += trimmed; - - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); - - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; - block_group->reserved -= reserved_bytes; - space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } - - return ret; -} - -static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - struct rb_node *node; + struct btrfs_free_space *entry = NULL; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 bytes = 0; + u64 actually_trimmed; int ret = 0; - u64 extent_start; - u64 extent_bytes; - u64 bytes; + + *trimmed = 0; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2651,118 +2609,81 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) { + if (!entry) + entry = 
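The cluster sizing heuristic restored above derives min_bytes from the allocation class: an exact fit for SSD_SPREAD, a fraction of the request for metadata (a larger fraction while delayed refs are flushing), and a quarter for data. Restated as a plain function with abbreviated names, assuming nothing beyond what the hunk shows:

#include <stdint.h>

#define MAX64(a, b) ((a) > (b) ? (a) : (b))

enum alloc_class { SSD_SPREAD, METADATA, DATA };

static uint64_t cluster_min_bytes(enum alloc_class c, uint64_t bytes,
				  uint64_t empty_size, int refs_flushing)
{
	switch (c) {
	case SSD_SPREAD:	/* no fragmentation tolerated at all */
		return bytes + empty_size;
	case METADATA:		/* tolerate holes; ask for more mid-flush */
		if (refs_flushing)
			return MAX64(bytes, (bytes + empty_size) >> 1);
		return MAX64(bytes, (bytes + empty_size) >> 4);
	default:		/* DATA: keep allocations fairly dense */
		return MAX64(bytes, (bytes + empty_size) >> 2);
	}
}

int main(void)
{
	/* metadata, not flushing: 64K request, 64K slack -> 64K minimum */
	return cluster_min_bytes(METADATA, 65536, 65536, 0) == 65536 ? 0 : 1;
}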
tree_search_offset(ctl, + offset_to_bitmap(ctl, start), + 1, 1); + + if (!entry || entry->offset >= end) { spin_unlock(&ctl->tree_lock); break; } - /* skip bitmaps */ - while (entry->bitmap) { - node = rb_next(&entry->offset_index); - if (!node) { + if (entry->bitmap) { + ret = search_bitmap(ctl, entry, &start, &bytes); + if (!ret) { + if (start >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + bytes = min(bytes, end - start); + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } else { + start = entry->offset + BITS_PER_BITMAP * + block_group->sectorsize; spin_unlock(&ctl->tree_lock); - goto out; + ret = 0; + continue; } - entry = rb_entry(node, struct btrfs_free_space, - offset_index); - } - - if (entry->offset >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - - extent_start = entry->offset; - extent_bytes = entry->bytes; - start = max(start, extent_start); - bytes = min(extent_start + extent_bytes, end) - start; - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - goto next; + } else { + start = entry->offset; + bytes = min(entry->bytes, end - start); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); } - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); - spin_unlock(&ctl->tree_lock); - ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes); - if (ret) - break; -next: - start += bytes; - - if (fatal_signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - cond_resched(); - } -out: - return ret; -} - -static int trim_bitmaps(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - int ret = 0; - int ret2; - u64 bytes; - u64 offset = offset_to_bitmap(ctl, start); - - while (offset < end) { - bool next_bitmap = false; - - spin_lock(&ctl->tree_lock); - - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - break; - } - - entry = tree_search_offset(ctl, offset, 1, 0); - if (!entry) { - spin_unlock(&ctl->tree_lock); - next_bitmap = true; - goto next; - } - - bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); - if (ret2 || start >= end) { - spin_unlock(&ctl->tree_lock); - next_bitmap = true; - goto next; - } - - bytes = min(bytes, end - start); - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - goto next; - } - - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - - spin_unlock(&ctl->tree_lock); + if (bytes >= minlen) { + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, + bytes, + &actually_trimmed); + + btrfs_add_free_space(block_group, start, bytes); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } - ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes); - if (ret) - break; 
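However the loop is organized, trimming follows one protocol: pull the range out of the free-space tree, fence it off as reserved so the allocator cannot hand it out while the discard is in flight, discard, then put it back. The protocol alone, with plain counters standing in for the space_info/block_group pair and a stub in place of btrfs_error_discard_extent:

#include <stdint.h>
#include <stdio.h>

struct group {
	uint64_t free;		/* bytes in the free-space tree */
	uint64_t reserved;	/* bytes fenced off from the allocator */
	int ro;			/* read-only groups skip the accounting */
};

static void issue_discard(uint64_t start, uint64_t bytes)
{
	printf("discard [%llu, +%llu)\n",
	       (unsigned long long)start, (unsigned long long)bytes);
}

static void trim_range(struct group *g, uint64_t start, uint64_t bytes)
{
	g->free -= bytes;		/* unlink from the tree */
	if (!g->ro)
		g->reserved += bytes;	/* fence it off */

	issue_discard(start, bytes);

	g->free += bytes;		/* re-add the free space */
	if (!g->ro)
		g->reserved -= bytes;
}

int main(void)
{
	struct group g = { 1 << 20, 0, 0 };

	trim_range(&g, 4096, 8192);
	printf("free %llu reserved %llu\n",	/* back to 1 MiB, 0 */
	       (unsigned long long)g.free, (unsigned long long)g.reserved);
	return 0;
}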
-next: - if (next_bitmap) { - offset += BITS_PER_BITMAP * ctl->unit; - } else { - start += bytes; - if (start >= offset + BITS_PER_BITMAP * ctl->unit) - offset += BITS_PER_BITMAP * ctl->unit; + if (ret) + break; + *trimmed += actually_trimmed; } + start += bytes; + bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2775,22 +2696,6 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, return ret; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) -{ - int ret; - - *trimmed = 0; - - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - return ret; - - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); - - return ret; -} - /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. diff --git a/trunk/fs/btrfs/inode-map.c b/trunk/fs/btrfs/inode-map.c index 213ffa86ce1b..f8962a957d65 100644 --- a/trunk/fs/btrfs/inode-map.c +++ b/trunk/fs/btrfs/inode-map.c @@ -438,8 +438,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, - trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -500,8 +498,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, out_put: iput(inode); out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, - trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 0da19a0ea00d..81b235a61f8c 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -1951,28 +1951,12 @@ enum btrfs_orphan_cleanup_state { void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; int ret; if (!list_empty(&root->orphan_list) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; - spin_lock(&root->orphan_lock); - if (!list_empty(&root->orphan_list)) { - spin_unlock(&root->orphan_lock); - return; - } - - if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { - spin_unlock(&root->orphan_lock); - return; - } - - block_rsv = root->orphan_block_rsv; - root->orphan_block_rsv = NULL; - spin_unlock(&root->orphan_lock); - if (root->orphan_item_inserted && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, @@ -1981,9 +1965,10 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_item_inserted = 0; } - if (block_rsv) { - WARN_ON(block_rsv->size > 0); - btrfs_free_block_rsv(root, block_rsv); + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; } } @@ -2239,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + /* + * Need to hold the imutex for reservation purposes, not + * a huge deal here but I have a WARN_ON in + * btrfs_delalloc_reserve_space to catch offenders. 
+ */ + mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); + mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -2853,7 +2845,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3017,6 +3009,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; + int encoding; int ret; int err = 0; u64 ino = btrfs_ino(inode); @@ -3066,6 +3059,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); + encoding = 0; if (found_key.objectid != ino) break; @@ -3078,6 +3072,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); @@ -3105,7 +3103,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item) { + if (!del_item && !encoding) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = new_size - @@ -3181,7 +3179,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); } @@ -3436,7 +3434,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); } else { /* @@ -4657,7 +4655,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); @@ -4725,7 +4723,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4784,7 +4782,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -4850,7 +4848,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -5123,7 +5121,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, } flush_dcache_page(page); } else 
if (create && PageUptodate(page)) { - BUG(); + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@ -6404,7 +6402,10 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + /* Need this to keep space reservations serialized */ + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); if (!ret) ret = btrfs_update_time(vma->vm_file); if (ret) { @@ -6493,8 +6494,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (!ret) return VM_FAULT_LOCKED; unlock_page(page); -out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out: return ret; } @@ -6667,7 +6668,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); } @@ -6748,7 +6749,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); mutex_init(&ei->log_mutex); - mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); } out_fail: - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); @@ -7246,7 +7246,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (!err) d_instantiate(dentry, inode); nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); diff --git a/trunk/fs/btrfs/ioctl.c b/trunk/fs/btrfs/ioctl.c index ab620014bcc3..5441ff1480fd 100644 --- a/trunk/fs/btrfs/ioctl.c +++ b/trunk/fs/btrfs/ioctl.c @@ -176,8 +176,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; - u64 ip_oldflags; - unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -194,9 +192,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); - ip_oldflags = ip->flags; - i_oldflags = inode->i_flags; - flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -254,24 +249,19 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_drop; - } + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); btrfs_end_transaction(trans, root); - out_drop: - if (ret) { - ip->flags = ip_oldflags; - inode->i_flags = i_oldflags; - } mnt_drop_write_file(file); + + ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); return ret; @@ -286,13 +276,14 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = 
btrfs_sb(fdentry(file)->d_sb); + struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -321,7 +312,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(fs_info->tree_root, &range); + ret = btrfs_trim_fs(root, &range); if (ret < 0) return ret; @@ -367,7 +358,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -867,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode, return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); + mutex_unlock(&inode->i_mutex); if (ret) return ret; again: @@ -1210,21 +1203,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1241,7 +1226,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_free; + goto out_unlock; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1256,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_free; + goto out_unlock; } } @@ -1265,7 +1250,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_free; + goto out_unlock; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1274,11 +1259,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_free; + goto out_unlock; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_free; + goto out_unlock; } do_div(new_size, root->sectorsize); @@ -1291,7 +1276,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_free; + goto out_unlock; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1299,10 +1284,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_free: - kfree(vol_args); -out: +out_unlock: mutex_unlock(&root->fs_info->volume_mutex); + kfree(vol_args); return 
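For reference, the fitrim handler above is driven from user space through the generic VFS ioctl; FITRIM and struct fstrim_range come from <linux/fs.h>, and len is rewritten on return with the number of bytes actually trimmed. A small caller (needs CAP_SYS_ADMIN and a mounted filesystem):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = (unsigned long long)-1;	/* whole filesystem */
	range.minlen = 0;	/* the kernel raises this to the device's
				 * discard granularity, as the hunk shows */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}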
ret; } @@ -2068,25 +2052,14 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2101,25 +2074,14 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2465,8 +2427,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - 0); + new_key.offset - datao); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -3016,7 +2977,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_item_pos; + u64 extent_offset; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3047,17 +3008,15 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, } ret = extent_from_logical(root->fs_info, loi->logical, path, &key); - btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; - extent_item_pos = loi->logical - key.objectid; + extent_offset = loi->logical - key.objectid; ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_item_pos, build_ino_list, - inodes); + extent_offset, build_ino_list, inodes); if (ret < 0) goto out; @@ -3075,163 +3034,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, return ret; } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - - bargs->flags = bctl->flags; - - if (atomic_read(&fs_info->balance_running)) - bargs->state |= BTRFS_BALANCE_STATE_RUNNING; - if (atomic_read(&fs_info->balance_pause_req)) - bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; - if (atomic_read(&fs_info->balance_cancel_req)) - bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; - - memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); - memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); - memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); - - if (lock) { - spin_lock(&fs_info->balance_lock); - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - spin_unlock(&fs_info->balance_lock); - } else { - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - } -} - -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - 
struct btrfs_ioctl_balance_args *bargs; - struct btrfs_balance_control *bctl; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); - goto out; - } - - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } - - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - - goto do_balance; - } - } else { - bargs = NULL; - } - - if (fs_info->balance_ctl) { - ret = -EINPROGRESS; - goto out_bargs; - } - - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out_bargs; - } - - bctl->fs_info = fs_info; - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; - } - -do_balance: - ret = btrfs_balance(bctl, bargs); - /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount - */ - if (arg) { - if (copy_to_user(arg, bargs, sizeof(*bargs))) - ret = -EFAULT; - } - -out_bargs: - kfree(bargs); -out: - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); - return ret; -} - -static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case BTRFS_BALANCE_CTL_PAUSE: - return btrfs_pause_balance(root->fs_info); - case BTRFS_BALANCE_CTL_CANCEL: - return btrfs_cancel_balance(root->fs_info); - } - - return -EINVAL; -} - -static long btrfs_ioctl_balance_progress(struct btrfs_root *root, - void __user *arg) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ioctl_balance_args *bargs; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out; - } - - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); - if (!bargs) { - ret = -ENOMEM; - goto out; - } - - update_ioctl_balance_args(fs_info, 1, bargs); - - if (copy_to_user(arg, bargs, sizeof(*bargs))) - ret = -EFAULT; - - kfree(bargs); -out: - mutex_unlock(&fs_info->balance_mutex); - return ret; -} - long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3276,7 +3078,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(root, NULL); + return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3308,12 +3110,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); - case BTRFS_IOC_BALANCE_V2: - return btrfs_ioctl_balance(root, argp); - case BTRFS_IOC_BALANCE_CTL: - return btrfs_ioctl_balance_ctl(root, arg); - case BTRFS_IOC_BALANCE_PROGRESS: - return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/trunk/fs/btrfs/ioctl.h b/trunk/fs/btrfs/ioctl.h index 
4f69028a68c4..252ae9915de8 100644 --- a/trunk/fs/btrfs/ioctl.h +++ b/trunk/fs/btrfs/ioctl.h @@ -109,55 +109,6 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; -/* balance control ioctl modes */ -#define BTRFS_BALANCE_CTL_PAUSE 1 -#define BTRFS_BALANCE_CTL_CANCEL 2 - -/* - * this is packed, because it should be exactly the same as its disk - * byte order counterpart (struct btrfs_disk_balance_args) - */ -struct btrfs_balance_args { - __u64 profiles; - __u64 usage; - __u64 devid; - __u64 pstart; - __u64 pend; - __u64 vstart; - __u64 vend; - - __u64 target; - - __u64 flags; - - __u64 unused[8]; -} __attribute__ ((__packed__)); - -/* report balance progress to userspace */ -struct btrfs_balance_progress { - __u64 expected; /* estimated # of chunks that will be - * relocated to fulfill the request */ - __u64 considered; /* # of chunks we have considered so far */ - __u64 completed; /* # of chunks relocated so far */ -}; - -#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) -#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) -#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) - -struct btrfs_ioctl_balance_args { - __u64 flags; /* in/out */ - __u64 state; /* out */ - - struct btrfs_balance_args data; /* in/out */ - struct btrfs_balance_args meta; /* in/out */ - struct btrfs_balance_args sys; /* in/out */ - - struct btrfs_balance_progress stat; /* out */ - - __u64 unused[72]; /* pad to 1k */ -}; - #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -321,11 +272,6 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) -#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) -#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ - struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/trunk/fs/btrfs/locking.c b/trunk/fs/btrfs/locking.c index 5e178d8f7167..d77b67c4b275 100644 --- a/trunk/fs/btrfs/locking.c +++ b/trunk/fs/btrfs/locking.c @@ -33,14 +33,6 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -65,14 +57,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (&eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -97,25 +81,12 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: - read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. 
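The ioctl structs deleted above are userspace ABI: packed so they match their disk counterpart byte for byte and padded out to a fixed size. The usual defensive pattern for such a struct, shown on a stand-in layout (demo_args mirrors only the field count of the deleted struct; _Static_assert needs C11):

#include <stdint.h>

struct demo_args {
	uint64_t profiles, usage, devid;
	uint64_t pstart, pend, vstart, vend;
	uint64_t target, flags;
	uint64_t unused[8];	/* reserved for future extension */
} __attribute__((__packed__));

/* Pin the ABI size at compile time so no one can grow it by accident. */
_Static_assert(sizeof(struct demo_args) == 17 * 8,
	       "demo_args must stay exactly 136 bytes");

int main(void)
{
	return 0;
}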
We allow - * an additional read lock to be added because it's for the same - * thread. btrfs_find_all_roots() depends on this as it may be - * called on a partly (write-)locked tree. - */ - BUG_ON(eb->lock_nested); - eb->lock_nested = 1; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); @@ -158,7 +129,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } atomic_inc(&eb->write_locks); atomic_inc(&eb->spinning_writers); - eb->lock_owner = current->pid; return 1; } @@ -167,15 +137,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); @@ -188,15 +149,6 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); if (atomic_dec_and_test(&eb->blocking_readers)) @@ -229,7 +181,6 @@ int btrfs_tree_lock(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); - eb->lock_owner = current->pid; return 0; } diff --git a/trunk/fs/btrfs/relocation.c b/trunk/fs/btrfs/relocation.c index 8c1aae2c845d..cfb55434a469 100644 --- a/trunk/fs/btrfs/relocation.c +++ b/trunk/fs/btrfs/relocation.c @@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); BUG_ON(ret); ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); BUG_ON(ret); } if (dirty) @@ -1778,23 +1778,21 @@ int replace_path(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2246,7 +2244,7 @@ int merge_reloc_roots(struct reloc_control *rc) } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); } if (found) { @@ -2560,7 +2558,7 @@ static int 
do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -2949,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); if (ret) goto out; diff --git a/trunk/fs/btrfs/scrub.c b/trunk/fs/btrfs/scrub.c index 9770cc5bfb76..ddf2c90d3fc0 100644 --- a/trunk/fs/btrfs/scrub.c +++ b/trunk/fs/btrfs/scrub.c @@ -25,7 +25,6 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" -#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -310,7 +309,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, u8 ref_level; unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_item_pos; + u64 extent_offset; path = btrfs_alloc_path(); @@ -330,13 +329,12 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; + extent_offset = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -353,7 +351,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } else { swarn.path = path; iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, + extent_offset, scrub_print_warning_inode, &swarn); } @@ -734,7 +732,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -960,7 +958,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - btrfsic_submit_bio(READ, sbio->bio); + submit_bio(READ, sbio->bio); return 0; } diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index 3ce97b217cbe..ae488aa1966a 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? ->put_super() can be triggered by any number - * of async events, up to and including delivery of SIGKILL to the - * last process that kept it busy. Or segfault in the aforementioned - * process... Whom would you report that to? - */ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; + + (void)ret; /* FIXME: need to fix VFS to return error? 
*/ } enum { @@ -163,11 +163,8 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, - Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -202,10 +199,6 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -404,40 +397,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); - break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity_print_mask: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", - info->check_integrity_print_mask); - } - break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); - ret = -EINVAL; - goto out; -#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -541,8 +500,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = sb->s_fs_info; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -572,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. 
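
[The tokens table and switch above are the usual match_table_t pattern: the mount-option string is split on commas and each token is mapped to an Opt_* constant before the switch applies it. A minimal userspace sketch of that pattern follows; opt_lookup, the token list, and the demo main are illustrative only, not the kernel API.]

#include <stdio.h>
#include <string.h>

enum { OPT_DEGRADED, OPT_DISCARD, OPT_RECOVERY, OPT_ERR };

static const struct { const char *name; int token; } demo_tokens[] = {
	{ "degraded", OPT_DEGRADED },
	{ "discard",  OPT_DISCARD },
	{ "recovery", OPT_RECOVERY },
	{ NULL,       OPT_ERR },
};

static int opt_lookup(const char *p)
{
	int i;

	/* linear scan, exactly like match_token() over a match_table_t */
	for (i = 0; demo_tokens[i].name; i++)
		if (strcmp(p, demo_tokens[i].name) == 0)
			return demo_tokens[i].token;
	return OPT_ERR;
}

int main(void)
{
	char opts[] = "degraded,discard,bogus";
	char *p = strtok(opts, ",");

	while (p) {
		printf("%-10s -> token %d\n", p, opt_lookup(p));
		p = strtok(NULL, ",");
	}
	return 0;
}
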
*/ - dir_id = btrfs_super_root_dir(fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -586,7 +544,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; + new_root = root->fs_info->fs_root; goto setup_root; } @@ -594,7 +552,7 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); if (IS_ERR(new_root)) return ERR_CAST(new_root); @@ -630,7 +588,7 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; struct btrfs_key key; int err; @@ -645,16 +603,18 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - err = open_ctree(sb, fs_devices, (char *)data); - if (err) { + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { printk("btrfs: open_ctree failed\n"); - return err; + return PTR_ERR(tree_root); } + sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -671,25 +631,23 @@ static int btrfs_fill_super(struct super_block *sb, save_mount_options(sb, data); cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; return 0; fail_close: - close_ctree(fs_info->tree_root); + close_ctree(tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = btrfs_sb(sb); int ret; trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(fs_info->btree_inode->i_mapping); + filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; } @@ -705,8 +663,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = info->tree_root; + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_fs_info *info = root->fs_info; char *compress_type; if (btrfs_test_opt(root, DEGRADED)) @@ -764,25 +722,28 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) - seq_puts(seq, ",skip_balance"); return 0; } static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); + struct btrfs_root *test_root = data; + struct btrfs_root *root = btrfs_sb(s); - return fs_info->fs_devices == p->fs_devices; + /* + * If this super block is going away, return false as it + * can't match as an existing super block. 
+ */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) { - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; + s->s_fs_info = data; + + return set_anon_super(s, data); } /* @@ -942,6 +903,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { + error = -ENOMEM; + goto error_fs_info; + } + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); @@ -961,30 +928,43 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; } if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_close_devices; + } + btrfs_close_devices(fs_devices); free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) - error = -EBUSY; } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->bdev_holder = fs_type; + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + + s->s_flags |= MS_ACTIVE; } - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { deactivate_locked_super(s); + return root; + } return root; @@ -997,8 +977,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = btrfs_sb(sb); int ret; ret = btrfs_parse_options(root, data); @@ -1014,13 +993,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_commit_super(root); WARN_ON(ret); } else { - if (fs_info->fs_devices->rw_devices == 0) + if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_fs_roots(fs_info); + ret = btrfs_cleanup_fs_roots(root->fs_info); WARN_ON(ret); /* recover relocation */ @@ -1189,18 +1168,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = fs_info->super_copy; - struct list_head *head = &fs_info->space_info; + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + __be32 *fsid = (__be32 
*)root->fs_info->fsid; int ret; /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&fs_info->chunk_mutex); + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1219,14 +1198,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); + ret = btrfs_calc_avail_data_space(root, &total_free_data); if (ret) { - mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); return ret; } buf->f_bavail += total_free_data; buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1240,18 +1219,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static void btrfs_kill_super(struct super_block *sb) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - kill_anon_super(sb); - free_fs_info(fs_info); -} - static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, + .kill_sb = kill_anon_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -1285,17 +1257,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_lock(&fs_info->transaction_kthread_mutex); - mutex_lock(&fs_info->cleaner_mutex); + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); return 0; } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_unlock(&fs_info->cleaner_mutex); - mutex_unlock(&fs_info->transaction_kthread_mutex); + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; } diff --git a/trunk/fs/btrfs/transaction.c b/trunk/fs/btrfs/transaction.c index 287a6728b1ad..81376d94cd3c 100644 --- a/trunk/fs/btrfs/transaction.c +++ b/trunk/fs/btrfs/transaction.c @@ -36,8 +36,6 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); - WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -110,11 +108,8 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.seq = 1; - init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); @@ -326,8 +321,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root 
*root, } if (num_bytes) { - trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)h, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } @@ -474,12 +467,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 2) { + while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; @@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0, 0); + btrfs_drop_snapshot(root, NULL, 0); else - btrfs_drop_snapshot(root, NULL, 1, 0); + btrfs_drop_snapshot(root, NULL, 1); } return 0; } diff --git a/trunk/fs/btrfs/tree-log.c b/trunk/fs/btrfs/tree-log.c index cb877e0886a7..3568374d419d 100644 --- a/trunk/fs/btrfs/tree-log.c +++ b/trunk/fs/btrfs/tree-log.c @@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); BUG_ON(ret); } else { /* diff --git a/trunk/fs/btrfs/ulist.c b/trunk/fs/btrfs/ulist.c deleted file mode 100644 index 12f5147bd2b1..000000000000 --- a/trunk/fs/btrfs/ulist.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (C) 2011 STRATO AG - * written by Arne Jansen - * Distributed under the GNU GPL license version 2. - */ - -#include -#include -#include "ulist.h" - -/* - * ulist is a generic data structure to hold a collection of unique u64 - * values. The only operations it supports is adding to the list and - * enumerating it. - * It is possible to store an auxiliary value along with the key. - * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * - * A sample usage for ulists is the enumeration of directed graphs without - * visiting a node twice. The pseudo-code could look like this: - * - * ulist = ulist_alloc(); - * ulist_add(ulist, root); - * elem = NULL; - * - * while ((elem = ulist_next(ulist, elem)) { - * for (all child nodes n in elem) - * ulist_add(ulist, n); - * do something useful with the node; - * } - * ulist_free(ulist); - * - * This assumes the graph nodes are adressable by u64. This stems from the - * usage for tree enumeration in btrfs, where the logical addresses are - * 64 bit. - * - * It is also useful for tree enumeration which could be done elegantly - * recursively, but is not possible due to kernel stack limitations. The - * loop would be similar to the above. - */ - -/** - * ulist_init - freshly initialize a ulist - * @ulist: the ulist to initialize - * - * Note: don't use this function to init an already used ulist, use - * ulist_reinit instead. 
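
[The overview comment above documents the ulist traversal idiom; note its pseudo-code drops a closing parenthesis in the while condition. Below is a self-contained userspace model of the same add-while-iterating pattern, using a fixed capacity instead of the krealloc growth; the mini_ulist names are illustrative.]

#include <stdint.h>
#include <stdio.h>

#define CAP 64

struct mini_ulist {
	uint64_t vals[CAP];
	int nnodes;
};

/* returns 1 if inserted, 0 if val was already present */
static int mini_ulist_add(struct mini_ulist *u, uint64_t val)
{
	int i;

	for (i = 0; i < u->nnodes; i++)
		if (u->vals[i] == val)
			return 0;
	u->vals[u->nnodes++] = val;
	return 1;
}

/* prev == NULL starts the iteration; items added mid-walk show up */
static uint64_t *mini_ulist_next(struct mini_ulist *u, uint64_t *prev)
{
	int next = prev ? (int)(prev - u->vals) + 1 : 0;

	return next < u->nnodes ? &u->vals[next] : NULL;
}

int main(void)
{
	struct mini_ulist seen = { .nnodes = 0 };
	uint64_t *elem = NULL;

	mini_ulist_add(&seen, 100);	/* the "root" node */
	while ((elem = mini_ulist_next(&seen, elem)) != NULL) {
		printf("visit %llu\n", (unsigned long long)*elem);
		/* children of a toy graph: n -> n + 1 while n < 103 */
		if (*elem < 103)
			mini_ulist_add(&seen, *elem + 1);
	}
	return 0;
}
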
- */ -void ulist_init(struct ulist *ulist) -{ - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; -} -EXPORT_SYMBOL(ulist_init); - -/** - * ulist_fini - free up additionally allocated memory for the ulist - * @ulist: the ulist from which to free the additional memory - * - * This is useful in cases where the base 'struct ulist' has been statically - * allocated. - */ -void ulist_fini(struct ulist *ulist) -{ - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are alocated they need to be freed. - */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ -} -EXPORT_SYMBOL(ulist_fini); - -/** - * ulist_reinit - prepare a ulist for reuse - * @ulist: ulist to be reused - * - * Free up all additional memory allocated for the list elements and reinit - * the ulist. - */ -void ulist_reinit(struct ulist *ulist) -{ - ulist_fini(ulist); - ulist_init(ulist); -} -EXPORT_SYMBOL(ulist_reinit); - -/** - * ulist_alloc - dynamically allocate a ulist - * @gfp_mask: allocation flags to for base allocation - * - * The allocated ulist will be returned in an initialized state. - */ -struct ulist *ulist_alloc(unsigned long gfp_mask) -{ - struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); - - if (!ulist) - return NULL; - - ulist_init(ulist); - - return ulist; -} -EXPORT_SYMBOL(ulist_alloc); - -/** - * ulist_free - free dynamically allocated ulist - * @ulist: ulist to free - * - * It is not necessary to call ulist_fini before. - */ -void ulist_free(struct ulist *ulist) -{ - if (!ulist) - return; - ulist_fini(ulist); - kfree(ulist); -} -EXPORT_SYMBOL(ulist_free); - -/** - * ulist_add - add an element to the ulist - * @ulist: ulist to add the element to - * @val: value to add to ulist - * @aux: auxiliary value to store along with val - * @gfp_mask: flags to use for allocation - * - * Note: locking must be provided by the caller. In case of rwlocks write - * locking is needed - * - * Add an element to a ulist. The @val will only be added if it doesn't - * already exist. If it is added, the auxiliary value @aux is stored along with - * it. In case @val already exists in the ulist, @aux is ignored, even if - * it differs from the already stored value. - * - * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been - * inserted. - * In case of allocation failure -ENOMEM is returned and the ulist stays - * unaltered. 
- */ -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) -{ - int i; - - for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) - return 0; - } - - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; - - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if (!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ++ulist->nnodes; - - return 1; -} -EXPORT_SYMBOL(ulist_add); - -/** - * ulist_next - iterate ulist - * @ulist: ulist to iterate - * @prev: previously returned element or %NULL to start iteration - * - * Note: locking must be provided by the caller. In case of rwlocks only read - * locking is needed - * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the - * end is reached. No guarantee is made with respect to the order in which - * the elements are returned. They might neither be returned in order of - * addition nor in ascending order. - * It is allowed to call ulist_add during an enumeration. Newly added items - * are guaranteed to show up in the running enumeration. - */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) -{ - int next; - - if (ulist->nnodes == 0) - return NULL; - - if (!prev) - return &ulist->nodes[0]; - - next = (prev - ulist->nodes) + 1; - if (next < 0 || next >= ulist->nnodes) - return NULL; - - return &ulist->nodes[next]; -} -EXPORT_SYMBOL(ulist_next); diff --git a/trunk/fs/btrfs/ulist.h b/trunk/fs/btrfs/ulist.h deleted file mode 100644 index 2e25dec58ec0..000000000000 --- a/trunk/fs/btrfs/ulist.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2011 STRATO AG - * written by Arne Jansen - * Distributed under the GNU GPL license version 2. - * - */ - -#ifndef __ULIST__ -#define __ULIST__ - -/* - * ulist is a generic data structure to hold a collection of unique u64 - * values. The only operations it supports is adding to the list and - * enumerating it. - * It is possible to store an auxiliary value along with the key. - * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - -/* - * element of the list - */ -struct ulist_node { - u64 val; /* value to store */ - unsigned long aux; /* auxiliary value saved along with the val */ -}; - -struct ulist { - /* - * number of elements stored in list - */ - unsigned long nnodes; - - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case the it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. 
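
[ulist_add's grow path above has one subtlety worth calling out: while the nodes still live in the inline int_nodes array, krealloc must be handed NULL, since there is no heap block to resize yet, and the inline contents are then copied over by hand. A userspace model of just that step using realloc; grow_list and INLINE_SIZE are illustrative.]

#include <stdlib.h>
#include <string.h>

#define INLINE_SIZE 16

struct grow_list {
	long *items;		/* points at inline_items[] until first growth */
	size_t alloced;
	long inline_items[INLINE_SIZE];
};

static int grow(struct grow_list *l, size_t new_alloced)
{
	/* pass NULL on first growth: inline storage is not a heap block */
	long *old = (l->alloced > INLINE_SIZE) ? l->items : NULL;
	long *n = realloc(old, new_alloced * sizeof(*n));

	if (!n)
		return -1;	/* list left unaltered, like the -ENOMEM case */
	if (!old)
		memcpy(n, l->inline_items, sizeof(l->inline_items));
	l->items = n;
	l->alloced = new_alloced;
	return 0;
}

int main(void)
{
	struct grow_list l = { .alloced = INLINE_SIZE };

	l.items = l.inline_items;
	l.inline_items[0] = 42;
	return grow(&l, INLINE_SIZE + 128) ? 1 : (l.items[0] != 42);
}
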
- */ - struct ulist_node *nodes; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; -}; - -void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); -void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); -void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); - -#endif diff --git a/trunk/fs/btrfs/volumes.c b/trunk/fs/btrfs/volumes.c index 0b4e2af7954d..f4b839fd3c9d 100644 --- a/trunk/fs/btrfs/volumes.c +++ b/trunk/fs/btrfs/volumes.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "compat.h" #include "ctree.h" @@ -33,7 +32,6 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" -#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -248,7 +246,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -708,6 +706,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; + mutex_lock(&uuid_mutex); + flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,7 +716,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } - mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -738,9 +737,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: + mutex_unlock(&uuid_mutex); return ret; } @@ -830,6 +829,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, /* * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,7 +848,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, * But if we don't find suitable free space, it is used to store the size of * the max free space. 
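
[The kernel-doc above pins down the calling convention: on success the hole's start and length come back through @start/@len; on failure the function still reports, via *len, the largest hole it saw, so the caller can retry with a smaller chunk. A userspace model of that contract over a sorted extent list; find_free_hole and struct used_ext are illustrative names.]

#include <errno.h>
#include <stdint.h>

struct used_ext { uint64_t start, len; };

static int find_free_hole(const struct used_ext *e, int n, uint64_t dev_size,
			  uint64_t num_bytes, uint64_t *start, uint64_t *len)
{
	uint64_t hole_start = 0, max_hole = 0;
	int i;

	for (i = 0; i <= n; i++) {
		/* hole ends at the next used extent, or at end of device */
		uint64_t hole_end = (i < n) ? e[i].start : dev_size;

		if (hole_end > hole_start) {
			uint64_t hole = hole_end - hole_start;

			if (hole > max_hole)
				max_hole = hole;
			if (hole >= num_bytes) {
				*start = hole_start;
				*len = hole;
				return 0;
			}
		}
		if (i < n)
			hole_start = e[i].start + e[i].len;
	}
	*len = max_hole;	/* best we found; lets caller size down */
	return -ENOSPC;
}

int main(void)
{
	struct used_ext used[] = { { 0, 4096 }, { 8192, 4096 } };
	uint64_t start, len;
	int ret = find_free_hole(used, 2, 1 << 20, 4096, &start, &len);

	return ret ? 1 : !(start == 4096 && len == 4096);
}
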
*/ -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -892,7 +893,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1281,6 +1282,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1450,6 +1452,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: + mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1466,7 +1469,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1625,6 +1629,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1690,7 +1695,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(root); + ret = btrfs_prepare_sprout(trans, root); BUG_ON(ret); } @@ -1752,7 +1757,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } - +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1760,7 +1766,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - return ret; + goto out; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2071,362 +2077,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) return ret; } -static int insert_balance_item(struct btrfs_root *root, - struct btrfs_balance_control *bctl) -{ - struct btrfs_trans_handle *trans; - struct btrfs_balance_item *item; - struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - int ret, err; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*item)); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); - btrfs_set_balance_data(leaf, item, &disk_bargs); - btrfs_cpu_balance_args_to_disk(&disk_bargs, 
&bctl->meta); - btrfs_set_balance_meta(leaf, item, &disk_bargs); - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); - btrfs_set_balance_sys(leaf, item, &disk_bargs); - - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - return ret; -} - -static int del_balance_item(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_key key; - int ret, err; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - return ret; -} - -/* - * This is a heuristic used to reduce the number of chunks balanced on - * resume after balance was interrupted. - */ -static void update_balance_args(struct btrfs_balance_control *bctl) -{ - /* - * Turn on soft mode for chunk types that were being converted. - */ - if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; - if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; - - /* - * Turn on usage filter if is not already used. The idea is - * that chunks that we have already balanced should be - * reasonably full. Don't do it for chunks that are being - * converted - that will keep us from relocating unconverted - * (albeit full) chunks. - */ - if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->data.usage = 90; - } - if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->sys.usage = 90; - } - if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->meta.usage = 90; - } -} - -/* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. Same goes for unset_balance_control. - */ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - - BUG_ON(fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = bctl; - spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - - BUG_ON(!fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = NULL; - spin_unlock(&fs_info->balance_lock); - - kfree(bctl); -} - -/* - * Balance filters. Return 1 if chunk should be filtered out - * (should not be balanced). 
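
[The filters that follow all share one convention: return 1 to exclude the chunk. should_balance_chunk() applies each filter only when its BTRFS_BALANCE_ARGS_* flag is set, and relocates the chunk only if none fires. The usage filter's threshold arithmetic (div_factor_fine) reduces to used < size * usage / 100; a runnable sketch of that check, with made-up constants for illustration.]

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor <= 0)
		return 0;
	if (factor >= 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	uint64_t chunk_size = 1024ULL * 1024 * 1024;	/* 1 GiB chunk */
	uint64_t chunk_used = 300ULL * 1024 * 1024;	/* 300 MiB used */
	int usage = 40;					/* e.g. -dusage=40 */

	/* the filter skips the chunk when used >= threshold */
	uint64_t thresh = div_factor_fine(chunk_size, usage);

	printf("balance this chunk: %s\n",
	       chunk_used < thresh ? "yes" : "no");
	return 0;
}
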
- */ -static int chunk_profiles_filter(u64 chunk_profile, - struct btrfs_balance_args *bargs) -{ - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; - - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (bargs->profiles & chunk_profile) - return 0; - - return 1; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor <= 0) - return 0; - if (factor >= 100) - return num; - - num *= factor; - do_div(num, 100); - return num; -} - -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - struct btrfs_block_group_cache *cache; - u64 chunk_used, user_thresh; - int ret = 1; - - cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); - - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); - if (chunk_used < user_thresh) - ret = 0; - - btrfs_put_block_group(cache); - return ret; -} - -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) -{ - struct btrfs_stripe *stripe; - int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - int i; - - for (i = 0; i < num_stripes; i++) { - stripe = btrfs_stripe_nr(chunk, i); - if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; - } - - return 1; -} - -/* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - struct btrfs_stripe *stripe; - int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - u64 stripe_offset; - u64 stripe_length; - int factor; - int i; - - if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; - - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - factor = num_stripes / factor; - - for (i = 0; i < num_stripes; i++) { - stripe = btrfs_stripe_nr(chunk, i); - if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) - continue; - - stripe_offset = btrfs_stripe_offset(leaf, stripe); - stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); - - if (stripe_offset < bargs->pend && - stripe_offset + stripe_length > bargs->pstart) - return 0; - } - - return 1; -} - -/* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - if (chunk_offset < bargs->vend && - chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) - /* at least part of the chunk is inside this vrange */ - return 0; - - return 1; -} - -static int chunk_soft_convert_filter(u64 chunk_profile, - struct btrfs_balance_args *bargs) -{ - if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; - - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; - - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (bargs->target & chunk_profile) - return 1; - - return 0; -} - -static int should_balance_chunk(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) -{ - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - struct btrfs_balance_args *bargs = NULL; - u64 chunk_type = btrfs_chunk_type(leaf, chunk); - - /* type filter */ - if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & - (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; - } - - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) - bargs = 
&bctl->data; - else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) - bargs = &bctl->sys; - else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) - bargs = &bctl->meta; - - /* profiles filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && - chunk_profiles_filter(chunk_type, bargs)) { - return 0; - } - - /* usage filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && - chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { - return 0; - } - - /* devid filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && - chunk_devid_filter(leaf, chunk, bargs)) { - return 0; - } - - /* drange filter, makes sense only with devid filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && - chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; - } - - /* vrange filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && - chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; - } - - /* soft profile changing mode */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && - chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; - } - - return 1; -} - static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2436,28 +2086,29 @@ static u64 div_factor(u64 num, int factor) return num; } -static int __btrfs_balance(struct btrfs_fs_info *fs_info) +int btrfs_balance(struct btrfs_root *dev_root) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; + int ret; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; - struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_key found_key; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; struct btrfs_trans_handle *trans; - struct extent_buffer *leaf; - int slot; - int ret; - int enospc_errors = 0; - bool counting = true; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2486,23 +2137,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = -ENOMEM; goto error; } - - /* zero out stat counters */ - spin_lock(&fs_info->balance_lock); - memset(&bctl->stat, 0, sizeof(bctl->stat)); - spin_unlock(&fs_info->balance_lock); -again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - if ((!counting && atomic_read(&fs_info->balance_pause_req)) || - atomic_read(&fs_info->balance_cancel_req)) { - ret = -ECANCELED; - goto error; - } - ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2512,19 +2151,15 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) * failed */ if (ret == 0) - BUG(); /* FIXME break ? 
*/ + break; ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - ret = 0; + if (ret) break; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); if (found_key.objectid != key.objectid) break; @@ -2532,375 +2167,22 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) if (found_key.offset == 0) break; - chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - - if (!counting) { - spin_lock(&fs_info->balance_lock); - bctl->stat.considered++; - spin_unlock(&fs_info->balance_lock); - } - - ret = should_balance_chunk(chunk_root, leaf, chunk, - found_key.offset); btrfs_release_path(path); - if (!ret) - goto loop; - - if (counting) { - spin_lock(&fs_info->balance_lock); - bctl->stat.expected++; - spin_unlock(&fs_info->balance_lock); - goto loop; - } - ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; - if (ret == -ENOSPC) { - enospc_errors++; - } else { - spin_lock(&fs_info->balance_lock); - bctl->stat.completed++; - spin_unlock(&fs_info->balance_lock); - } -loop: key.offset = found_key.offset - 1; } - - if (counting) { - btrfs_release_path(path); - counting = false; - goto again; - } + ret = 0; error: btrfs_free_path(path); - if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", - enospc_errors); - if (!ret) - ret = -ENOSPC; - } - + mutex_unlock(&dev_root->fs_info->volume_mutex); return ret; } -static inline int balance_need_close(struct btrfs_fs_info *fs_info) -{ - /* cancel requested || normal exit path */ - return atomic_read(&fs_info->balance_cancel_req) || - (atomic_read(&fs_info->balance_pause_req) == 0 && - atomic_read(&fs_info->balance_cancel_req) == 0); -} - -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ - int ret; - - unset_balance_control(fs_info); - ret = del_balance_item(fs_info->tree_root); - BUG_ON(ret); -} - -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs); - -/* - * Should be called with both balance and volume mutexes held - */ -int btrfs_balance(struct btrfs_balance_control *bctl, - struct btrfs_ioctl_balance_args *bargs) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - u64 allowed; - int ret; - - if (btrfs_fs_closing(fs_info) || - atomic_read(&fs_info->balance_pause_req) || - atomic_read(&fs_info->balance_cancel_req)) { - ret = -EINVAL; - goto out; - } - - /* - * In case of mixed groups both data and meta should be picked, - * and identical options should be given for both of them. - */ - allowed = btrfs_super_incompat_flags(fs_info->super_copy); - if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { - if (!(bctl->flags & BTRFS_BALANCE_DATA) || - !(bctl->flags & BTRFS_BALANCE_METADATA) || - memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * Profile changing sanity checks. Skip them if a simple - * balance is requested. 
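
[The sanity checks below start by computing an allowed-profile mask from the device count: DUP only makes sense on a single device, and RAID10 needs at least four. A compact model of that gate; the BG_*/AVAIL_SINGLE bit values here are placeholders, not the on-disk flag values.]

#include <stdint.h>

#define BG_DUP		(1ULL << 0)
#define BG_RAID0	(1ULL << 1)
#define BG_RAID1	(1ULL << 2)
#define BG_RAID10	(1ULL << 3)
#define AVAIL_SINGLE	(1ULL << 4)

static uint64_t allowed_profiles(int num_devices)
{
	uint64_t allowed = AVAIL_SINGLE;	/* single is always valid */

	if (num_devices == 1)
		allowed |= BG_DUP;
	else if (num_devices < 4)
		allowed |= BG_RAID0 | BG_RAID1;
	else
		allowed |= BG_RAID0 | BG_RAID1 | BG_RAID10;
	return allowed;
}

/* a target is valid when it sets no bits outside the allowed mask */
static int target_ok(uint64_t target, int num_devices)
{
	return (target & ~allowed_profiles(num_devices)) == 0;
}

int main(void)
{
	/* e.g. converting to RAID10 on a 2-device fs must be rejected */
	return target_ok(BG_RAID10, 2) ? 1 : 0;
}
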
- */ - if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & - BTRFS_BALANCE_ARGS_CONVERT)) - goto do_balance; - - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10); - - if (!profile_is_valid(bctl->data.target, 1) || - bctl->data.target & ~allowed) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", - (unsigned long long)bctl->data.target); - ret = -EINVAL; - goto out; - } - if (!profile_is_valid(bctl->meta.target, 1) || - bctl->meta.target & ~allowed) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", - (unsigned long long)bctl->meta.target); - ret = -EINVAL; - goto out; - } - if (!profile_is_valid(bctl->sys.target, 1) || - bctl->sys.target & ~allowed) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", - (unsigned long long)bctl->sys.target); - ret = -EINVAL; - goto out; - } - - if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); - ret = -EINVAL; - goto out; - } - - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; - } - } - -do_balance: - ret = insert_balance_item(fs_info->tree_root, bctl); - if (ret && ret != -EEXIST) - goto out; - - if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { - BUG_ON(ret == -EEXIST); - set_balance_control(bctl); - } else { - BUG_ON(ret != -EEXIST); - spin_lock(&fs_info->balance_lock); - update_balance_args(bctl); - spin_unlock(&fs_info->balance_lock); - } - - atomic_inc(&fs_info->balance_running); - mutex_unlock(&fs_info->balance_mutex); - - ret = __btrfs_balance(fs_info); - - mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); - - if (bargs) { - memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); - } - - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { - __cancel_balance(fs_info); - } - - wake_up(&fs_info->balance_wait_q); - - return ret; -out: - if (bctl->flags & BTRFS_BALANCE_RESUME) - __cancel_balance(fs_info); - else - kfree(bctl); - return ret; -} - -static int balance_kthread(void *data) -{ - struct btrfs_balance_control *bctl = - (struct btrfs_balance_control *)data; - struct btrfs_fs_info *fs_info = bctl->fs_info; - int ret = 0; - - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - set_balance_control(bctl); - - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); - } else { - printk(KERN_INFO "btrfs: continuing balance\n"); - ret = btrfs_balance(bctl, NULL); - } - - 
mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); - return ret; -} - -int btrfs_recover_balance(struct btrfs_root *tree_root) -{ - struct task_struct *tsk; - struct btrfs_balance_control *bctl; - struct btrfs_balance_item *item; - struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out; - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - if (ret < 0) - goto out_bctl; - if (ret > 0) { /* ret = -ENOENT; */ - ret = 0; - goto out_bctl; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - - bctl->fs_info = tree_root->fs_info; - bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; - - btrfs_balance_data(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); - btrfs_balance_meta(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); - btrfs_balance_sys(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - - tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); - if (IS_ERR(tsk)) - ret = PTR_ERR(tsk); - else - goto out; - -out_bctl: - kfree(bctl); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_pause_balance(struct btrfs_fs_info *fs_info) -{ - int ret = 0; - - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - mutex_unlock(&fs_info->balance_mutex); - return -ENOTCONN; - } - - if (atomic_read(&fs_info->balance_running)) { - atomic_inc(&fs_info->balance_pause_req); - mutex_unlock(&fs_info->balance_mutex); - - wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); - - mutex_lock(&fs_info->balance_mutex); - /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); - atomic_dec(&fs_info->balance_pause_req); - } else { - ret = -ENOTCONN; - } - - mutex_unlock(&fs_info->balance_mutex); - return ret; -} - -int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) -{ - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - mutex_unlock(&fs_info->balance_mutex); - return -ENOTCONN; - } - - atomic_inc(&fs_info->balance_cancel_req); - /* - * if we are running just wait and return, balance item is - * deleted in btrfs_balance in this case - */ - if (atomic_read(&fs_info->balance_running)) { - mutex_unlock(&fs_info->balance_mutex); - wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); - mutex_lock(&fs_info->balance_mutex); - } else { - /* __cancel_balance needs volume_mutex */ - mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl) - __cancel_balance(fs_info); - - mutex_unlock(&fs_info->volume_mutex); - } - - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); - atomic_dec(&fs_info->balance_cancel_req); - mutex_unlock(&fs_info->balance_mutex); - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. 
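
[The deleted btrfs_pause_balance()/btrfs_cancel_balance() above implement a small handshake: raise a request counter, then sleep until the worker observes it and drops balance_running to zero. Below is a userspace sketch of that handshake with pthreads standing in for wait_event()/wake_up(); the names mirror the fields above, but this is only a model, not the kernel code.]

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_q = PTHREAD_COND_INITIALIZER;
static int running;	/* stands in for balance_running */
static int pause_req;	/* stands in for balance_pause_req */

void pause_balance(void)
{
	pthread_mutex_lock(&lock);
	if (running) {
		pause_req++;
		while (running)		/* wait_event(balance_wait_q, ...) */
			pthread_cond_wait(&wait_q, &lock);
		pause_req--;
	}
	pthread_mutex_unlock(&lock);
}

void balance_stopped(void)	/* called by the balance worker */
{
	pthread_mutex_lock(&lock);
	running = 0;
	pthread_cond_broadcast(&wait_q);	/* wake_up(&balance_wait_q) */
	pthread_mutex_unlock(&lock);
}
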
@@ -3041,7 +2323,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) return ret; } -static int btrfs_add_system_chunk(struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -3158,14 +2441,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; - else - max_stripe_size = 256 * 1024 * 1024; + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; + max_stripe_size = 8 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", @@ -3217,7 +2496,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(device, + ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -3408,7 +2687,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, item_size); BUG_ON(ret); } @@ -3473,7 +2752,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -3483,7 +2763,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, @@ -3620,13 +2901,26 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; + int stripes_allocated = 8; + int stripes_required = 1; int stripe_index; int i; - int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + stripes_allocated = 1; +again: + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), + GFP_NOFS); + if (!bbio) + return -ENOMEM; + + atomic_set(&bbio->error, 0); + } + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -3645,6 +2939,32 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, if (mirror_num > map->num_stripes) mirror_num = 0; + /* if our btrfs_bio struct is too small, back off and try again */ + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = 
map->sub_stripes; + max_errors = 1; + } + } + if (rw & REQ_DISCARD) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes_required = map->num_stripes; + } + } + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(bbio); + goto again; + } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -3660,7 +2980,10 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); @@ -3736,55 +3059,81 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, } BUG_ON(stripe_index >= map->num_stripes); - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); - if (!bbio) { - ret = -ENOMEM; - goto out; - } - atomic_set(&bbio->error, 0); - if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - } - for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - if (i < sub_stripes) + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + u64 stripes; + u32 last_stripe = 0; + int j; + + div_u64_rem(stripe_nr_end - 1, + map->num_stripes, + &last_stripe); + + for (j = 0; j < map->num_stripes; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + map->num_stripes, &test); + if (test == stripe_index) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, map->num_stripes); + bbio->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i == 0) { bbio->stripes[i].length -= stripe_offset; - if ((i / sub_stripes + 1) % - sub_stripes == remaining_stripes) + stripe_offset = 0; + } + if (stripe_index == last_stripe) bbio->stripes[i].length -= stripe_end_offset; - if (i == sub_stripes - 1) - stripe_offset = 0; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u64 stripes; + int j; + int factor = map->num_stripes / + map->sub_stripes; + u32 last_stripe = 0; + + div_u64_rem(stripe_nr_end - 1, + factor, &last_stripe); + last_stripe *= map->sub_stripes; + + for (j = 0; j < factor; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + factor, &test); + + if (test == + stripe_index / map->sub_stripes) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, factor); + bbio->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i < map->sub_stripes) { 
+ bbio->stripes[i].length -= + stripe_offset; + if (i == map->sub_stripes - 1) + stripe_offset = 0; + } + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + map->sub_stripes - 1)) { + bbio->stripes[i].length -= + stripe_end_offset; + } } else bbio->stripes[i].length = *length; @@ -3806,22 +3155,15 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, stripe_index++; } } - - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } - - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; out: free_extent_map(em); - return ret; + return 0; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, @@ -3962,7 +3304,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); bio_put(bio); return 0; } @@ -4057,7 +3399,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; @@ -4226,7 +3568,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + mutex_lock(&uuid_mutex); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -4264,6 +3606,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: + mutex_unlock(&uuid_mutex); return ret; } @@ -4406,9 +3749,6 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; - mutex_lock(&uuid_mutex); - lock_chunks(root); - /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -4459,9 +3799,6 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } ret = 0; error: - unlock_chunks(root); - mutex_unlock(&uuid_mutex); - btrfs_free_path(path); return ret; } diff --git a/trunk/fs/btrfs/volumes.h b/trunk/fs/btrfs/volumes.h index 19ac95048b88..78f2d4d4f37f 100644 --- a/trunk/fs/btrfs/volumes.h +++ b/trunk/fs/btrfs/volumes.h @@ -186,51 +186,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) - -/* - * Profile changing flags. 
When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). - */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - -struct btrfs_balance_args; -struct btrfs_balance_progress; -struct btrfs_balance_control { - struct btrfs_fs_info *fs_info; - - struct btrfs_balance_args data; - struct btrfs_balance_args meta; - struct btrfs_balance_args sys; - - u64 flags; - - struct btrfs_balance_progress stat; -}; - int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -273,12 +228,9 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_balance_control *bctl, - struct btrfs_ioctl_balance_args *bargs); -int btrfs_recover_balance(struct btrfs_root *tree_root); -int btrfs_pause_balance(struct btrfs_fs_info *fs_info); -int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_balance(struct btrfs_root *dev_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif diff --git a/trunk/fs/btrfs/xattr.c b/trunk/fs/btrfs/xattr.c index e7a5659087e6..3848b04e310e 100644 --- a/trunk/fs/btrfs/xattr.c +++ b/trunk/fs/btrfs/xattr.c @@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); return ret; } diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c index 208c6aa4a989..c283a1ec008e 100644 --- a/trunk/fs/namei.c +++ b/trunk/fs/namei.c @@ -140,19 +140,21 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *result = __getname(); - int retval; - - if (!result) - return ERR_PTR(-ENOMEM); - - retval = do_getname(filename, result); - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(result); - return ERR_PTR(retval); + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + int retval = do_getname(filename, tmp); + + result = tmp; + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 9cde9edf9c4d..5485a5388ecb 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -198,7 +198,65 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +static struct mm_struct *__check_mem_permission(struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (!mm) + return ERR_PTR(-EINVAL); + + /* + * A task can always look at itself, in case it chooses + * to use system calls instead of load instructions. 
+ */ + if (task == current) + return mm; + + /* + * If current is actively ptrace'ing, and would also be + * permitted to freshly attach with ptrace now, permit it. + */ + if (task_is_stopped_or_traced(task)) { + int match; + rcu_read_lock(); + match = (ptrace_parent(task) == current); + rcu_read_unlock(); + if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) + return mm; + } + + /* + * No one else is allowed. + */ + mmput(mm); + return ERR_PTR(-EPERM); +} + +/* + * If current may access user memory in @task return a reference to the + * corresponding mm, otherwise ERR_PTR. + */ +static struct mm_struct *check_mem_permission(struct task_struct *task) +{ + struct mm_struct *mm; + int err; + + /* + * Avoid racing if task exec's as we might get a new mm but validate + * against old credentials. + */ + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = __check_mem_permission(task); + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + +struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm; int err; @@ -209,7 +267,7 @@ static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + !ptrace_may_access(task, PTRACE_MODE_READ)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -218,11 +276,6 @@ static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } -struct mm_struct *mm_for_maps(struct task_struct *task) -{ - return mm_access(task, PTRACE_MODE_READ); -} - static int proc_pid_cmdline(struct task_struct *task, char * buffer) { int res = 0; @@ -699,39 +752,38 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - struct mm_struct *mm; - - if (!task) - return -ESRCH; - - mm = mm_access(task, PTRACE_MODE_ATTACH); - put_task_struct(task); - - if (IS_ERR(mm)) - return PTR_ERR(mm); - + file->private_data = (void*)((long)current->self_exec_id); /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; - file->private_data = mm; - return 0; } static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - int ret; + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char *page; unsigned long src = *ppos; - struct mm_struct *mm = file->private_data; + int ret = -ESRCH; + struct mm_struct *mm; - if (!mm) - return 0; + if (!task) + goto out_no_task; + ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - return -ENOMEM; + goto out; + + mm = check_mem_permission(task); + ret = PTR_ERR(mm); + if (IS_ERR(mm)) + goto out_free; + + ret = -EIO; + + if (file->private_data != (void*)((long)current->self_exec_id)) + goto out_put; ret = 0; @@ -758,7 +810,13 @@ static ssize_t mem_read(struct file * file, char __user * buf, } *ppos = src; +out_put: + mmput(mm); +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return ret; } @@ -767,15 +825,27 @@ static ssize_t mem_write(struct file * file, const char __user *buf, { int copied; char *page; + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); unsigned long dst = *ppos; - struct mm_struct *mm = file->private_data; + struct mm_struct *mm; - if (!mm) - return 0; + copied = -ESRCH; + if (!task) + goto out_no_task; + copied = 
-ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - return -ENOMEM; + goto out_task; + + mm = check_mem_permission(task); + copied = PTR_ERR(mm); + if (IS_ERR(mm)) + goto out_free; + + copied = -EIO; + if (file->private_data != (void *)((long)current->self_exec_id)) + goto out_mm; copied = 0; while (count > 0) { @@ -799,7 +869,13 @@ static ssize_t mem_write(struct file * file, const char __user *buf, } *ppos = dst; +out_mm: + mmput(mm); +out_free: free_page((unsigned long) page); +out_task: + put_task_struct(task); +out_no_task: return copied; } @@ -819,20 +895,11 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } -static int mem_release(struct inode *inode, struct file *file) -{ - struct mm_struct *mm = file->private_data; - - mmput(mm); - return 0; -} - static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, - .release = mem_release, }; static ssize_t environ_read(struct file *file, char __user *buf, @@ -1132,6 +1199,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); @@ -1160,7 +1230,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; diff --git a/trunk/fs/xfs/xfs_aops.c b/trunk/fs/xfs/xfs_aops.c index 74b9baf36ac3..574d4ee9b625 100644 --- a/trunk/fs/xfs/xfs_aops.c +++ b/trunk/fs/xfs/xfs_aops.c @@ -111,7 +111,8 @@ xfs_ioend_new_eof( xfs_fsize_t bsize; bsize = ioend->io_offset + ioend->io_size; - isize = MIN(i_size_read(VFS_I(ip)), bsize); + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); return isize > ip->i_d.di_size ? isize : 0; } @@ -125,7 +126,11 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) } /* - * Update on-disk file size now that data has been written to disk. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. * * This function does not block as blocking on the inode lock in IO completion * can lead to IO completion order dependency deadlocks.. If it can't get the @@ -1273,15 +1278,6 @@ xfs_end_io_direct_write( { struct xfs_ioend *ioend = iocb->private; - /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. - */ - if (offset + size > i_size_read(ioend->io_inode)) - i_size_write(ioend->io_inode, offset + size); - /* * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect @@ -1344,11 +1340,12 @@ xfs_vm_write_failed( if (to > inode->i_size) { /* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have - * made it to disk yet as the page is still locked at this - * point. 
+ * punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we just truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. */ struct xfs_inode *ip = XFS_I(inode); xfs_fileoff_t start_fsb; diff --git a/trunk/fs/xfs/xfs_attr.c b/trunk/fs/xfs/xfs_attr.c index 08b9ac644c31..1e5d97f86ea8 100644 --- a/trunk/fs/xfs/xfs_attr.c +++ b/trunk/fs/xfs/xfs_attr.c @@ -827,6 +827,10 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/trunk/fs/xfs/xfs_attr_leaf.c b/trunk/fs/xfs/xfs_attr_leaf.c index d25eafd4d28d..c1b55e596551 100644 --- a/trunk/fs/xfs/xfs_attr_leaf.c +++ b/trunk/fs/xfs/xfs_attr_leaf.c @@ -271,6 +271,10 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -322,6 +326,7 @@ xfs_attr_fork_reset( ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_afp == NULL); + ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -384,6 +389,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } diff --git a/trunk/fs/xfs/xfs_bmap.c b/trunk/fs/xfs/xfs_bmap.c index 188ef2fbd628..d0ab78837057 100644 --- a/trunk/fs/xfs/xfs_bmap.c +++ b/trunk/fs/xfs/xfs_bmap.c @@ -249,27 +249,7 @@ xfs_bmbt_lookup_ge( } /* - * Check if the inode needs to be converted to btree format. - */ -static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) -{ - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork); -} - -/* - * Check if the inode should be converted to extent format. - */ -static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) -{ - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork); -} - -/* - * Update the record referred to by cur to the value given +* Update the record referred to by cur to the value given * by [off, bno, len, state]. * This either works (return 0) or gets an EFSCORRUPTED error.
*/ @@ -703,8 +683,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -787,8 +767,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -856,8 +836,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -904,7 +884,8 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -1440,7 +1421,8 @@ xfs_bmap_add_extent_unwritten_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); @@ -1830,7 +1812,8 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -3054,7 +3037,8 @@ xfs_bmap_extents_to_btree( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); /* * Make space in the inode incore. 
*/ @@ -3200,8 +3184,13 @@ xfs_bmap_forkoff_reset( ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) + if (dfl_forkoff > ip->i_d.di_forkoff) { ip->i_d.di_forkoff = dfl_forkoff; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); + } } } @@ -3441,6 +3430,8 @@ xfs_bmap_add_attrfork( int error; /* error return value */ ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -3495,9 +3486,12 @@ xfs_bmap_add_attrfork( error = XFS_ERROR(EINVAL); goto error1; } - + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; xfs_bmap_init(&flist, &firstblock); @@ -3541,17 +3535,20 @@ xfs_bmap_add_attrfork( } else spin_unlock(&mp->m_sb_lock); } - - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) + if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; error2: xfs_bmap_cancel(&flist); error1: xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; } @@ -3997,8 +3994,11 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; + if (whichfork == XFS_DATA_FORK) { + return S_ISREG(ip->i_d.di_mode) ? + (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + } #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4010,7 +4010,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4379,6 +4379,8 @@ xfs_bmapi_read( XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); @@ -4869,6 +4871,8 @@ xfs_bmapi_write( return XFS_ERROR(EIO); ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); XFS_STATS_INC(xs_blk_mapw); @@ -4977,7 +4981,8 @@ xfs_bmapi_write( /* * Transform from btree to extents, give it cur. 
*/ - if (xfs_bmap_wants_extents(ip, whichfork)) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { int tmp_logflags = 0; ASSERT(bma.cur); @@ -4987,10 +4992,10 @@ xfs_bmapi_write( if (error) goto error0; } - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork)); + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); error = 0; error0: /* @@ -5090,7 +5095,8 @@ xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5316,8 +5322,7 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ - XFS_IFORK_MAXEXT(ip, whichfork) && + XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { @@ -5348,11 +5353,13 @@ xfs_bunmapi( } } *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); /* * Convert to a btree if necessary. */ - if (xfs_bmap_needs_btree(ip, whichfork)) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5363,7 +5370,8 @@ xfs_bunmapi( /* * transform from btree to extents, give it cur */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { + else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5374,6 +5382,8 @@ xfs_bunmapi( /* * transform from extents to local? */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; error0: /* @@ -5424,7 +5434,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); fixlen -= out->bmv_offset; if (prealloced && out->bmv_offset + out->bmv_length == end) { /* Came to hole at EOF. Trim it. 
*/ @@ -5512,7 +5522,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = XFS_ISIZE(ip); + fixlen = ip->i_size; } } @@ -5541,7 +5551,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); if (error) goto out_unlock_iolock; diff --git a/trunk/fs/xfs/xfs_dfrag.c b/trunk/fs/xfs/xfs_dfrag.c index dd974a55c77d..654dc6f05bac 100644 --- a/trunk/fs/xfs/xfs_dfrag.c +++ b/trunk/fs/xfs/xfs_dfrag.c @@ -163,14 +163,12 @@ xfs_swap_extents_check_format( /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) return EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) return EINVAL; /* @@ -182,25 +180,18 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) - return EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; - } + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) + return EINVAL; /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) - return EINVAL; - - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; - } + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) + return EINVAL; return 0; } @@ -357,6 +348,16 @@ xfs_swap_extents( *ifp = *tifp; /* struct copy */ *tifp = *tempifp; /* struct copy */ + /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. 
+ */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + /* * Fix the on-disk inode values */ diff --git a/trunk/fs/xfs/xfs_file.c b/trunk/fs/xfs/xfs_file.c index 7e5bc872f2b4..f675f3d9d7b3 100644 --- a/trunk/fs/xfs/xfs_file.c +++ b/trunk/fs/xfs/xfs_file.c @@ -327,7 +327,7 @@ xfs_file_aio_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if (iocb->ki_pos == ip->i_size) return 0; return -XFS_ERROR(EINVAL); } @@ -412,6 +412,51 @@ xfs_file_splice_read( return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip, + xfs_fsize_t new_size) +{ + if (new_size == ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (new_size == ip->i_new_size) + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + /* * xfs_file_splice_write() does not use xfs_rw_ilock() because * generic_file_splice_write() takes the i_mutex itself. This, in theory, @@ -430,6 +475,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -443,12 +489,19 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -636,26 +689,28 @@ xfs_zero_eof( /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to - * @iolock, and returns with it held. Might upgrade the iolock to exclusive - * if called for a direct write beyond i_size. + * Returns with iolock held according to @iolock. 
*/ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; return error; } @@ -665,21 +720,36 @@ xfs_file_aio_write_checks( /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which involves - * dropping all locks and relocking to maintain correct locking order. - * If we do this, restart the function to ensure all checks and values - * are still valid. + * write. There is no need to issue zeroing if another in-flight IO ends + * at or before this one. If zeroing is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > i_size_read(inode)) { + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = -xfs_zero_eof(ip, *pos, ip->i_size); } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. + */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -724,7 +794,9 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t ocount, + xfs_fsize_t *new_size, + int *iolock) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -734,10 +806,10 @@ xfs_file_dio_aio_write( ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; - int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; + *iolock = 0; if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); @@ -752,31 +824,31 @@ xfs_file_dio_aio_write( * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) - iolock = XFS_IOLOCK_EXCL; + *iolock = XFS_IOLOCK_EXCL; else - iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, iolock); + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, *iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock.
*/ - if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, iolock); - iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) - goto out; + return ret; if (mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) - goto out; + return ret; } /* @@ -785,18 +857,15 @@ xfs_file_dio_aio_write( */ if (unaligned_io) inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + *iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); -out: - xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; @@ -808,7 +877,9 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t ocount, + xfs_fsize_t *new_size, + int *iolock) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -816,14 +887,14 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; size_t count = ocount; - xfs_rw_ilock(ip, iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) - goto out; + return ret; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -837,15 +908,13 @@ xfs_file_buffered_aio_write( * page locks and retry *once* */ if (ret == -ENOSPC && !enospc) { - enospc = 1; ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + if (ret) + return ret; + enospc = 1; + goto write_retry; } - current->backing_dev_info = NULL; -out: - xfs_rw_iunlock(ip, iolock); return ret; } @@ -861,7 +930,9 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; + int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -880,22 +951,33 @@ xfs_file_aio_write( return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount); + ocount, &new_size, &iolock); + + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); - if (ret > 0) { - ssize_t err; + if (ret <= 0) + goto out_unlock; - XFS_STATS_ADD(xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error; - /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); - if (err < 0) - ret = err; + xfs_rw_iunlock(ip, iolock); + error = xfs_file_fsync(file, pos, end, + (file->f_flags & __O_SYNC) ? 
0 : 1); + xfs_rw_ilock(ip, iolock); + if (error) + ret = error; } +out_unlock: + xfs_aio_write_newsize_update(ip, new_size); + xfs_rw_iunlock(ip, iolock); return ret; } diff --git a/trunk/fs/xfs/xfs_fs_subr.c b/trunk/fs/xfs/xfs_fs_subr.c index 652b875a9d4c..ed88ed16811c 100644 --- a/trunk/fs/xfs/xfs_fs_subr.c +++ b/trunk/fs/xfs/xfs_fs_subr.c @@ -90,7 +90,7 @@ xfs_wait_on_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); + last == -1 ? ip->i_size - 1 : last); } return 0; } diff --git a/trunk/fs/xfs/xfs_iget.c b/trunk/fs/xfs/xfs_iget.c index 8c3e46394d48..3960a066d7ff 100644 --- a/trunk/fs/xfs/xfs_iget.c +++ b/trunk/fs/xfs/xfs_iget.c @@ -77,7 +77,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); + ASSERT(completion_done(&ip->i_flush)); ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -94,6 +94,8 @@ xfs_inode_alloc( ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + ip->i_size = 0; + ip->i_new_size = 0; return ip; } @@ -148,7 +150,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); + ASSERT(completion_done(&ip->i_flush)); /* * Because we use RCU freeing we need to ensure the inode always @@ -448,6 +450,8 @@ xfs_iget( *ipp = ip; + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. @@ -711,19 +715,3 @@ xfs_isilocked( return 0; } #endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git a/trunk/fs/xfs/xfs_inode.c b/trunk/fs/xfs/xfs_inode.c index b21022499c2e..9dda7cc32848 100644 --- a/trunk/fs/xfs/xfs_inode.c +++ b/trunk/fs/xfs/xfs_inode.c @@ -299,8 +299,11 @@ xfs_iformat( { xfs_attr_shortform_t *atp; int size; - int error = 0; + int error; xfs_fsize_t di_size; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + error = 0; if (unlikely(be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > @@ -347,6 +350,7 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; @@ -405,10 +409,10 @@ xfs_iformat( } if (!XFS_DFORK_Q(dip)) return 0; - ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); @@ -600,11 +604,10 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. 
*/ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || - XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max + || XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) + || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -832,6 +835,12 @@ xfs_iread( * with the uninitialized part of it. */ ip->i_d.di_mode = 0; + /* + * Initialize the per-fork minima and maxima for a new + * inode here. xfs_iformat will do it for old inodes. + */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } /* @@ -852,6 +861,7 @@ xfs_iread( } ip->i_delayed_blks = 0; + ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -1041,6 +1051,7 @@ xfs_ialloc( } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); @@ -1154,6 +1165,52 @@ xfs_ialloc( return 0; } +/* + * Check to make sure that there are no blocks allocated to the + * file beyond the size of the file. We don't check this for + * files with fixed size extents or real time extents, but we + * at least do it for regular files. + */ +#ifdef DEBUG +STATIC void +xfs_isize_check( + struct xfs_inode *ip, + xfs_fsize_t isize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t map_first; + int nimaps; + xfs_bmbt_irec_t imaps[2]; + int error; + + if (!S_ISREG(ip->i_d.di_mode)) + return; + + if (XFS_IS_REALTIME_INODE(ip)) + return; + + if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + return; + + nimaps = 2; + map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + /* + * The filesystem could be shutting down, so bmapi may return + * an error. + */ + error = xfs_bmapi_read(ip, map_first, + (XFS_B_TO_FSB(mp, + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; + ASSERT(nimaps == 1); + ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); +} +#else /* DEBUG */ +#define xfs_isize_check(ip, isize) +#endif /* DEBUG */ + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1195,14 +1252,12 @@ xfs_itruncate_extents( int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size <= XFS_ISIZE(ip)); + ASSERT(new_size <= ip->i_size); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - trace_xfs_itruncate_extents_start(ip, new_size); - /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1270,14 +1325,6 @@ xfs_itruncate_extents( goto out; } - /* - * Always re-log the inode so that our permanent transaction can keep - * on rolling it forward in the log. 
- */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - trace_xfs_itruncate_extents_end(ip, new_size); - out: *tpp = tp; return error; @@ -1291,6 +1338,74 @@ xfs_itruncate_extents( goto out; } +int +xfs_itruncate_data( + struct xfs_trans **tpp, + struct xfs_inode *ip, + xfs_fsize_t new_size) +{ + int error; + + trace_xfs_itruncate_data_start(ip, new_size); + + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new_size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + if (ip->i_d.di_nextents > 0) { + /* + * If we are not changing the file size then do not update + * the on-disk file size - we may be called from + * xfs_inactive_free_eofblocks(). If we update the on-disk + * file size and then the system crashes before the contents + * of the file are flushed to disk then the files may be + * full of holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + } + } + + error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); + if (error) + return error; + + /* + * If we are not changing the file size then do not update the on-disk + * file size - we may be called from xfs_inactive_free_eofblocks(). + * If we update the on-disk file size and then the system crashes + * before the contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + xfs_isize_check(ip, new_size); + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } + + ASSERT(new_size != 0 || ip->i_delayed_blks == 0); + ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); + + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_data_end(ip, new_size); + return 0; +} + /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1709,7 +1824,8 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || + (!S_ISREG(ip->i_d.di_mode))); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1728,6 +1844,8 @@ xfs_ifree( ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -2033,7 +2151,7 @@ xfs_idestroy_fork( * once someone is waiting for it to be unpinned. 
*/ static void -xfs_iunpin( +xfs_iunpin_nowait( struct xfs_inode *ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); @@ -2045,29 +2163,14 @@ xfs_iunpin( } -static void -__xfs_iunpin_wait( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); - - xfs_iunpin(ip); - - do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_ipincount(ip)) - io_schedule(); - } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); -} - void xfs_iunpin_wait( struct xfs_inode *ip) { - if (xfs_ipincount(ip)) - __xfs_iunpin_wait(ip); + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } } /* @@ -2407,9 +2510,9 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ip->i_d.di_nextents > ip->i_df.if_ext_max); iip = ip->i_itemp; mp = ip->i_mount; @@ -2426,7 +2529,7 @@ xfs_iflush( * out for us if they occur after the log force completes. */ if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin(ip); + xfs_iunpin_nowait(ip); xfs_ifunlock(ip); return EAGAIN; } @@ -2523,9 +2626,9 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ip->i_d.di_nextents > ip->i_df.if_ext_max); iip = ip->i_itemp; mp = ip->i_mount; diff --git a/trunk/fs/xfs/xfs_inode.h b/trunk/fs/xfs/xfs_inode.h index 2f27b7454085..f0e6b151ba37 100644 --- a/trunk/fs/xfs/xfs_inode.h +++ b/trunk/fs/xfs/xfs_inode.h @@ -66,6 +66,7 @@ typedef struct xfs_ifork { struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + unsigned char if_ext_max; /* max # of extent records */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ @@ -205,12 +206,12 @@ typedef struct xfs_icdinode { ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ ((ip)->i_d.di_anextents = (n))) -#define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + #ifdef __KERNEL__ +struct bhv_desc; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -219,6 +220,12 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; +typedef struct dm_attrs_s { + __uint32_t da_dmevmask; /* DMIG event mask */ + __uint16_t da_dmstate; /* DMIG state info */ + __uint16_t da_pad; /* DMIG extra padding */ +} dm_attrs_t; + typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ @@ -237,19 +244,27 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ + struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ + wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. 
*/ - unsigned long i_flags; /* see defined flags below */ + unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ + xfs_fsize_t i_size; /* in-memory size */ + xfs_fsize_t i_new_size; /* size when write completes */ + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; +#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ + (ip)->i_size : (ip)->i_d.di_size; + /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -262,18 +277,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) return &ip->i_vnode; } -/* - * For regular files we only update the on-disk filesize when actually - * writing data back to disk. Until then only the copy in the VFS inode - * is uptodate. - */ -static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) -{ - if (S_ISREG(ip->i_d.di_mode)) - return i_size_read(VFS_I(ip)); - return ip->i_d.di_size; -} - /* * i_flags helper functions */ @@ -328,19 +331,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } -static inline int -xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) -{ - int ret; - - spin_lock(&ip->i_flags_lock); - ret = ip->i_flags & flags; - if (!ret) - ip->i_flags |= flags; - spin_unlock(&ip->i_flags_lock); - return ret; -} - /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was chosen @@ -360,20 +350,36 @@ xfs_set_projid(struct xfs_inode *ip, ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); } +/* + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. + */ +static inline void xfs_iflock(xfs_inode_t *ip) +{ + wait_for_completion(&ip->i_flush); +} + +static inline int xfs_iflock_nowait(xfs_inode_t *ip) +{ + return try_wait_for_completion(&ip->i_flush); +} + +static inline void xfs_ifunlock(xfs_inode_t *ip) +{ + complete(&ip->i_flush); +} + /* * In-core inode flags. */ -#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ -#define XFS_ISTALE (1 << 1) /* inode has been staled */ -#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ -#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. 
*/ -#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) -#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ -#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -385,34 +391,6 @@ xfs_set_projid(struct xfs_inode *ip, XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ XFS_IFILESTREAM); -/* - * Synchronize processes attempting to flush the in-core inode back to disk. - */ - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - xfs_iflags_clear(ip, XFS_IFLOCK); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) @@ -513,6 +491,8 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); +int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, + xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); diff --git a/trunk/fs/xfs/xfs_inode_item.c b/trunk/fs/xfs/xfs_inode_item.c index 91d71dcd4852..cfd6c7f8cc3c 100644 --- a/trunk/fs/xfs/xfs_inode_item.c +++ b/trunk/fs/xfs/xfs_inode_item.c @@ -79,6 +79,8 @@ xfs_inode_item_size( break; case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); iip->ili_format.ilf_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV | XFS_ILOG_UUID); @@ -555,7 +557,7 @@ xfs_inode_item_unpin( trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) - wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + wake_up(&ip->i_ipin_wait); } /* @@ -717,7 +719,7 @@ xfs_inode_item_pushbuf( * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. 
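The flag definitions above change shape for a reason: the removed variants spell the lock bits out as bit numbers (__XFS_IFLOCK_BIT, __XFS_IPINNED_BIT) precisely so sleepers can queue on bit_waitqueue(&ip->i_flags, bit) and the release side can pair a clear with wake_up_bit(), while the restored plain hex masks carry no such requirement. A sketch of the bit-keyed lock pattern, assuming <linux/wait.h> (names are illustrative):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

#define MY_LOCK_BIT     7       /* bit number inside the flags word */

static void my_bit_lock(unsigned long *flags)
{
        wait_queue_head_t *wq = bit_waitqueue(flags, MY_LOCK_BIT);
        DEFINE_WAIT_BIT(wait, flags, MY_LOCK_BIT);

        while (test_and_set_bit(MY_LOCK_BIT, flags)) {
                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                if (test_bit(MY_LOCK_BIT, flags))
                        schedule();
                finish_wait(wq, &wait.wait);
        }
}

static void my_bit_unlock(unsigned long *flags)
{
        clear_bit_unlock(MY_LOCK_BIT, flags);
        /* Keyed wakeup: only waiters on this word/bit pair are woken. */
        wake_up_bit(flags, MY_LOCK_BIT);
}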
*/ - if (!xfs_isiflocked(ip) || + if (completion_done(&ip->i_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return true; @@ -750,7 +752,7 @@ xfs_inode_item_push( struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(!completion_done(&ip->i_flush)); /* * Since we were able to lock the inode's flush lock and diff --git a/trunk/fs/xfs/xfs_iomap.c b/trunk/fs/xfs/xfs_iomap.c index 246c7d57c6f9..9afa282aa937 100644 --- a/trunk/fs/xfs/xfs_iomap.c +++ b/trunk/fs/xfs/xfs_iomap.c @@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb( xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align = 0; + xfs_extlen_t align; int eof, error; - if (!XFS_IS_REALTIME_INODE(ip)) { - /* - * Round up the allocation request to a stripe unit - * (m_dalign) boundary if the file size is >= stripe unit - * size, and we are allocating past the allocation eof. - * - * If mounted with the "-o swalloc" option the alignment is - * increased from the strip unit size to the stripe width. - */ - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) - align = mp->m_swidth; - else if (mp->m_dalign) - align = mp->m_dalign; - - if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) - new_last_fsb = roundup_64(*last_fsb, align); - } + if (XFS_IS_REALTIME_INODE(ip)) + ; + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. + */ + else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && + (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) + new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) + new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); /* * Always round up the allocation request to an extent boundary @@ -154,7 +154,7 @@ xfs_iomap_write_direct( offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > XFS_ISIZE(ip)) { + if ((offset + count) > ip->i_size) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) goto error_out; @@ -211,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; - if (offset < XFS_ISIZE(ip) || extsz) + if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate( int found_delalloc = 0; *prealloc = 0; - if (offset + count <= XFS_ISIZE(ip)) + if ((offset + count) <= ip->i_size) return 0; /* @@ -340,7 +340,7 @@ xfs_iomap_prealloc_size( * if we pass in alloc_blocks = 0. Hence the "+ 1" to * ensure we always pass in a non-zero value. */ - alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -564,7 +564,7 @@ xfs_iomap_write_allocate( * back.... 
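Both alignment branches above reduce to the same arithmetic: round the requested last block up to a multiple of the chosen alignment (stripe unit m_dalign, or stripe width m_swidth under -o swalloc), and only once the file size has reached one alignment unit. A small runnable illustration of the rounding that roundup_64() performs (plain C; the values are invented for the example):

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next multiple of align (align > 0). */
static uint64_t roundup64(uint64_t x, uint64_t align)
{
        return ((x + align - 1) / align) * align;
}

int main(void)
{
        uint64_t last_fsb = 1000;       /* requested last block */
        uint64_t dalign = 64;           /* stripe unit, in blocks */

        /* 1000 -> 1024: the allocation now ends on a stripe boundary. */
        printf("aligned last_fsb = %llu\n",
               (unsigned long long)roundup64(last_fsb, dalign));
        return 0;
}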
*/ nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + end_fsb = XFS_B_TO_FSB(mp, ip->i_size); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) diff --git a/trunk/fs/xfs/xfs_iops.c b/trunk/fs/xfs/xfs_iops.c index ab302539e5b9..f9babd179223 100644 --- a/trunk/fs/xfs/xfs_iops.c +++ b/trunk/fs/xfs/xfs_iops.c @@ -750,7 +750,6 @@ xfs_setattr_size( struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; - xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; @@ -778,13 +777,11 @@ xfs_setattr_size( lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); - oldsize = inode->i_size; - newsize = iattr->ia_size; - /* * Short circuit the truncate case for zero length files. */ - if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; @@ -810,14 +807,14 @@ xfs_setattr_size( * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (newsize > oldsize) { + if (iattr->ia_size > ip->i_size) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ - error = xfs_zero_eof(ip, newsize, oldsize); + error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); if (error) goto out_unlock; } @@ -836,8 +833,8 @@ xfs_setattr_size( * here and prevents waiting for other data not within the range we * care about here. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, + if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, FI_NONE); if (error) goto out_unlock; @@ -848,7 +845,8 @@ xfs_setattr_size( */ inode_dio_wait(inode); - error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + error = -block_truncate_page(inode->i_mapping, iattr->ia_size, + xfs_get_blocks); if (error) goto out_unlock; @@ -859,7 +857,7 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - truncate_setsize(inode, newsize); + truncate_setsize(inode, iattr->ia_size); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; @@ -878,29 +876,19 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } - /* - * The first thing we do is set the size to new_size permanently on - * disk. This way we don't have to worry about anyone ever being able - * to look at the data being freed even in the face of a crash. - * What we're getting around here is the case where we free a block, it - * is allocated to another file, it is written to, and then we crash. - * If the new data gets written to the file but the log buffers - * containing the free and reallocation don't, then we'd end up with - * garbage in the blocks being freed. As long as we make the new size - * permanent before actually freeing any blocks it doesn't matter if - * they get written to. 
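When the setattr path above grows a file, xfs_zero_eof() first zeroes whatever sits in the last block beyond the old EOF, so stale block contents can never become visible inside the new size. A rough userspace analogue of that step (a sketch only, not the XFS implementation; error handling trimmed):

#include <unistd.h>
#include <string.h>
#include <sys/types.h>

/*
 * Zero from old_size up to the end of its block (clamped to new_size),
 * then extend the file, so no stale bytes appear inside the new size.
 */
static int zero_eof(int fd, off_t old_size, off_t new_size, off_t blksz)
{
        off_t end = ((old_size + blksz - 1) / blksz) * blksz;
        char zeros[4096];

        if (end > new_size)
                end = new_size;
        memset(zeros, 0, sizeof(zeros));
        while (old_size < end) {
                size_t n = (size_t)(end - old_size);

                if (n > sizeof(zeros))
                        n = sizeof(zeros);
                if (pwrite(fd, zeros, n, old_size) != (ssize_t)n)
                        return -1;
                old_size += n;
        }
        return ftruncate(fd, new_size);
}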
- */ - ip->i_d.di_size = newsize; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - if (newsize <= oldsize) { - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { + error = xfs_itruncate_data(&tp, ip, iattr->ia_size); if (error) goto out_trans_abort; diff --git a/trunk/fs/xfs/xfs_qm_syscalls.c b/trunk/fs/xfs/xfs_qm_syscalls.c index eafbcff81f3a..5cc3dde1bc90 100644 --- a/trunk/fs/xfs/xfs_qm_syscalls.c +++ b/trunk/fs/xfs/xfs_qm_syscalls.c @@ -31,7 +31,6 @@ #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -264,18 +263,13 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + error = xfs_itruncate_data(&tp, ip, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); goto out_unlock; } - ASSERT(ip->i_d.di_nextents == 0); - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); diff --git a/trunk/fs/xfs/xfs_super.c b/trunk/fs/xfs/xfs_super.c index ee5b695c99a7..281961c1d81a 100644 --- a/trunk/fs/xfs/xfs_super.c +++ b/trunk/fs/xfs/xfs_super.c @@ -828,6 +828,14 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); + init_waitqueue_head(&ip->i_ipin_wait); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&ip->i_flush); + complete(&ip->i_flush); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); diff --git a/trunk/fs/xfs/xfs_sync.c b/trunk/fs/xfs/xfs_sync.c index 40b75eecd2b4..72c01a1c16e7 100644 --- a/trunk/fs/xfs/xfs_sync.c +++ b/trunk/fs/xfs/xfs_sync.c @@ -707,13 +707,14 @@ xfs_reclaim_inode_grab( return 1; /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. + * do some unlocked checks first to avoid unnecessary lock traffic. + * The first is a flush lock check, the second is a already in reclaim + * check. Only do these checks if we are not going to block on locks. 
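The xfs_fs_inode_init_once() hunk earlier in this chunk documents a small trick worth restating: completions count, so issuing complete() right after init_completion() leaves one token available, and the first try_wait_for_completion() then succeeds without blocking, i.e. the flush lock starts out released. The same semantics in runnable userspace C, with a POSIX semaphore standing in for the counting primitive (an analogue, not kernel code):

#include <semaphore.h>
#include <stdio.h>

int main(void)
{
        sem_t flush;

        sem_init(&flush, 0, 0); /* like init_completion(): count 0 */
        sem_post(&flush);       /* like complete(): count 1, "unlocked" */

        /* First trylock succeeds; the second fails until someone posts. */
        printf("first  trywait: %s\n", sem_trywait(&flush) == 0 ? "ok" : "busy");
        printf("second trywait: %s\n", sem_trywait(&flush) == 0 ? "ok" : "busy");

        sem_destroy(&flush);
        return 0;
}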
*/ if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { return 1; + } /* * The radix tree lock here protects a thread in xfs_iget from racing diff --git a/trunk/fs/xfs/xfs_trace.h b/trunk/fs/xfs/xfs_trace.h index 6b6df5802e95..a9d5b1e06efe 100644 --- a/trunk/fs/xfs/xfs_trace.h +++ b/trunk/fs/xfs/xfs_trace.h @@ -891,6 +891,7 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, flags) @@ -899,15 +900,17 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " "offset 0x%llx count 0x%zx ioflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, + __entry->new_size, __entry->offset, __entry->count, __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) @@ -975,6 +978,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) + __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, type) @@ -986,6 +990,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->type = type; @@ -993,11 +998,13 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? 
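The SYNC_TRYLOCK test above is a standard fast path: when the caller must not block, peek at the state without taking any locks and bail out early, accepting that the racy read occasionally lets a caller through to the authoritative locked re-check. A generic sketch of that shape (illustrative names; the locked path always re-validates):

#include <linux/spinlock.h>
#include <linux/types.h>

#define OBJ_BUSY        0x01

struct obj {
        spinlock_t      lock;
        unsigned long   flags;  /* OBJ_BUSY etc. */
};

static int obj_try_grab(struct obj *o, bool nonblocking)
{
        /* Racy, unlocked peek: a cheap rejection for the trylock case. */
        if (nonblocking && (o->flags & OBJ_BUSY))
                return -1;

        spin_lock(&o->lock);
        /* Authoritative re-check under the lock. */
        if (o->flags & OBJ_BUSY) {
                spin_unlock(&o->lock);
                return -1;
        }
        o->flags |= OBJ_BUSY;
        spin_unlock(&o->lock);
        return 0;
}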
irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd type %s " + "startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, + __entry->new_size, __entry->offset, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), @@ -1024,23 +1031,26 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) + __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->isize = VFS_I(ip)->i_size; + __entry->isize = ip->i_size; __entry->disize = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, + __entry->new_size, __entry->offset, __entry->count) ); @@ -1080,8 +1090,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), @@ -1558,6 +1568,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(xfs_ino_t, ino) __field(int, format) __field(int, nex) + __field(int, max_nex) __field(int, broot_size) __field(int, fork_off) ), @@ -1567,16 +1578,18 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->ino = ip->i_ino; __entry->format = ip->i_d.di_format; __entry->nex = ip->i_d.di_nextents; + __entry->max_nex = ip->i_df.if_ext_max; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "broot size %d, fork offset %d", + "Max in-fork extents %d, broot size %d, fork offset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, + __entry->max_nex, __entry->broot_size, __entry->fork_off) ) diff --git a/trunk/fs/xfs/xfs_vnodeops.c b/trunk/fs/xfs/xfs_vnodeops.c index 0cf52da9d246..f2fea868d4db 100644 --- a/trunk/fs/xfs/xfs_vnodeops.c +++ b/trunk/fs/xfs/xfs_vnodeops.c @@ -175,7 +175,7 @@ xfs_free_eofblocks( * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); if (last_fsb <= end_fsb) return 0; @@ -226,14 +226,7 @@ xfs_free_eofblocks( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* - * Do not update the on-disk file size. If we update the - * on-disk file size and then the system crashes before the - * contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). 
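Every tracepoint touched in the xfs_trace.h hunks above edits the same three sections of the TRACE_EVENT()/DECLARE_EVENT_CLASS() template: TP_STRUCT__entry declares the record layout, TP_fast_assign fills it at the probe site, and TP_printk renders it at read time, which is why adding or dropping a field such as new_size means touching all three. A minimal sketch of the shape (not an event from this patch; the usual TRACE_INCLUDE_PATH boilerplate is abbreviated):

#undef TRACE_SYSTEM
#define TRACE_SYSTEM sample

#if !defined(_TRACE_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SAMPLE_H

#include <linux/tracepoint.h>

TRACE_EVENT(sample_resize,

        TP_PROTO(unsigned long ino, long long size, long long new_size),

        TP_ARGS(ino, size, new_size),

        TP_STRUCT__entry(
                __field(unsigned long,  ino)
                __field(long long,      size)
                __field(long long,      new_size)
        ),

        TP_fast_assign(
                __entry->ino            = ino;
                __entry->size           = size;
                __entry->new_size       = new_size;
        ),

        TP_printk("ino 0x%lx size 0x%llx new_size 0x%llx",
                  __entry->ino, __entry->size, __entry->new_size)
);

#endif /* _TRACE_SAMPLE_H */
#include <trace/define_trace.h>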
- */ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); + error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { /* * If we get an error at this point we simply don't @@ -547,8 +540,8 @@ xfs_release( return 0; if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { @@ -625,7 +618,7 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || + ((ip->i_d.di_size != 0) || (ip->i_size != 0) || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && S_ISREG(ip->i_d.di_mode)); @@ -639,12 +632,12 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - ip->i_delayed_blks != 0))) { + (ip->i_delayed_blks != 0)))) { error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; @@ -677,18 +670,13 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + error = xfs_itruncate_data(&tp, ip, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } - - ASSERT(ip->i_d.di_nextents == 0); } else if (S_ISLNK(ip->i_d.di_mode)) { /* @@ -1973,11 +1961,11 @@ xfs_zero_remaining_bytes( * since nothing can read beyond eof. The space will * be zeroed when the file is extended anyway. */ - if (startoff >= XFS_ISIZE(ip)) + if (startoff >= ip->i_size) return 0; - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + if (endoff > ip->i_size) + endoff = ip->i_size; bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -2272,7 +2260,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); + bf->l_start += ip->i_size; break; default: return XFS_ERROR(EINVAL); @@ -2289,7 +2277,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = XFS_ISIZE(ip); + fsize = ip->i_size; /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve diff --git a/trunk/include/linux/audit.h b/trunk/include/linux/audit.h index 9ff7a2c48b50..426ab9f4dd85 100644 --- a/trunk/include/linux/audit.h +++ b/trunk/include/linux/audit.h @@ -26,7 +26,6 @@ #include #include -#include /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system @@ -182,40 +181,6 @@ * AUDIT_UNUSED_BITS is updated if need be. 
*/ #define AUDIT_UNUSED_BITS 0x07FFFC00 -/* AUDIT_FIELD_COMPARE rule list */ -#define AUDIT_COMPARE_UID_TO_OBJ_UID 1 -#define AUDIT_COMPARE_GID_TO_OBJ_GID 2 -#define AUDIT_COMPARE_EUID_TO_OBJ_UID 3 -#define AUDIT_COMPARE_EGID_TO_OBJ_GID 4 -#define AUDIT_COMPARE_AUID_TO_OBJ_UID 5 -#define AUDIT_COMPARE_SUID_TO_OBJ_UID 6 -#define AUDIT_COMPARE_SGID_TO_OBJ_GID 7 -#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 -#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 - -#define AUDIT_COMPARE_UID_TO_AUID 10 -#define AUDIT_COMPARE_UID_TO_EUID 11 -#define AUDIT_COMPARE_UID_TO_FSUID 12 -#define AUDIT_COMPARE_UID_TO_SUID 13 - -#define AUDIT_COMPARE_AUID_TO_FSUID 14 -#define AUDIT_COMPARE_AUID_TO_SUID 15 -#define AUDIT_COMPARE_AUID_TO_EUID 16 - -#define AUDIT_COMPARE_EUID_TO_SUID 17 -#define AUDIT_COMPARE_EUID_TO_FSUID 18 - -#define AUDIT_COMPARE_SUID_TO_FSUID 19 - -#define AUDIT_COMPARE_GID_TO_EGID 20 -#define AUDIT_COMPARE_GID_TO_FSGID 21 -#define AUDIT_COMPARE_GID_TO_SGID 22 - -#define AUDIT_COMPARE_EGID_TO_FSGID 23 -#define AUDIT_COMPARE_EGID_TO_SGID 24 -#define AUDIT_COMPARE_SGID_TO_FSGID 25 - -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_SGID_TO_FSGID /* Rule fields */ /* These are useful when checking the @@ -257,9 +222,6 @@ #define AUDIT_PERM 106 #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 -#define AUDIT_OBJ_UID 109 -#define AUDIT_OBJ_GID 110 -#define AUDIT_FIELD_COMPARE 111 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) @@ -446,24 +408,28 @@ struct audit_field { void *lsm_rule; }; +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 +#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); #ifdef CONFIG_AUDITSYSCALL /* These are defined in auditsc.c */ /* Public API */ +extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); -extern void __audit_free(struct task_struct *task); -extern void __audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3); -extern void __audit_syscall_exit(int ret_success, long ret_value); +extern void audit_free(struct task_struct *task); +extern void audit_syscall_entry(int arch, + int major, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3); +extern void audit_syscall_exit(int failed, long return_code); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); extern void __audit_inode_child(const struct dentry *dentry, const struct inode *parent); -extern void __audit_seccomp(unsigned long syscall); extern void __audit_ptrace(struct task_struct *t); static inline int audit_dummy_context(void) @@ -471,27 +437,6 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } -static inline void audit_free(struct task_struct *task) -{ - if (unlikely(task->audit_context)) - __audit_free(task); -} -static inline void audit_syscall_entry(int arch, int major, unsigned long a0, - unsigned long a1, unsigned long a2, - unsigned long a3) -{ - if (unlikely(!audit_dummy_context())) - __audit_syscall_entry(arch, major, a0, a1, a2, a3); -} -static inline void audit_syscall_exit(void *pt_regs) -{ - if (unlikely(current->audit_context)) { - int success = is_syscall_success(pt_regs); - int 
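The deleted inlines above all follow one header idiom: a cheap static inline wrapper tests a fast-path condition (audit_dummy_context(), annotated unlikely()) and only then calls the out-of-line, double-underscore worker, so the common no-audit case costs a single well-predicted branch and no function call. The bare shape of the pattern (illustrative names, not the audit API):

#include <linux/compiler.h>
#include <linux/types.h>

extern void __do_account(int event);    /* out-of-line slow path, in a .c file */
extern bool accounting_enabled(void);   /* cheap per-task test */

static inline void do_account(int event)
{
        /* Common case: accounting is off and the call is never made. */
        if (unlikely(accounting_enabled()))
                __do_account(event);
}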
return_code = regs_return_value(pt_regs); - - __audit_syscall_exit(success, return_code); - } -} static inline void audit_getname(const char *name) { if (unlikely(!audit_dummy_context())) @@ -508,12 +453,6 @@ static inline void audit_inode_child(const struct dentry *dentry, } void audit_core_dumps(long signr); -static inline void audit_seccomp(unsigned long syscall) -{ - if (unlikely(!audit_dummy_context())) - __audit_seccomp(syscall); -} - static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) @@ -524,16 +463,17 @@ static inline void audit_ptrace(struct task_struct *t) extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); -extern int audit_set_loginuid(uid_t loginuid); +extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) #define audit_get_sessionid(t) ((t)->sessionid) extern void audit_log_task_context(struct audit_buffer *ab); extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int __audit_bprm(struct linux_binprm *bprm); -extern void __audit_socketcall(int nargs, unsigned long *args); -extern int __audit_sockaddr(int len, void *addr); +extern int audit_bprm(struct linux_binprm *bprm); +extern void audit_socketcall(int nargs, unsigned long *args); +extern int audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); +extern int audit_set_macxattr(const char *name); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); @@ -559,23 +499,6 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } -static inline int audit_bprm(struct linux_binprm *bprm) -{ - if (unlikely(!audit_dummy_context())) - return __audit_bprm(bprm); - return 0; -} -static inline void audit_socketcall(int nargs, unsigned long *args) -{ - if (unlikely(!audit_dummy_context())) - __audit_socketcall(nargs, args); -} -static inline int audit_sockaddr(int len, void *addr) -{ - if (unlikely(!audit_dummy_context())) - return __audit_sockaddr(len, addr); - return 0; -} static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) @@ -621,11 +544,12 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int audit_signals; -#else /* CONFIG_AUDITSYSCALL */ +#else +#define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(r) do { ; } while (0) +#define audit_syscall_exit(f,r) do { ; } while (0) #define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) @@ -634,7 +558,6 @@ extern int audit_signals; #define audit_inode(n,d) do { (void)(d); } while (0) #define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) -#define audit_seccomp(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) #define audit_get_sessionid(t) (-1) @@ -645,6 
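The #else branch beginning above stubs the whole API out for !CONFIG_AUDITSYSCALL builds, using two deliberate macro shapes: do { ; } while (0) yields a void stub that still parses as exactly one statement, and the GCC statement expression ({ 0; }) yields a stub with a usable return value. A tiny runnable demonstration (plain C with GCC extensions):

#include <stdio.h>

#define audit_log_stub(msg)     do { ; } while (0)     /* void, one statement */
#define audit_alloc_stub(t)     ({ 0; })               /* evaluates to 0 */

int main(void)
{
        int flag = 1;

        if (flag)
                audit_log_stub("x");    /* behaves like a real call site */
        else
                printf("never\n");

        printf("alloc -> %d\n", audit_alloc_stub(NULL));
        return 0;
}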
+568,7 @@ extern int audit_signals; #define audit_socketcall(n,a) ((void)0) #define audit_fd_pair(n,a) ((void)0) #define audit_sockaddr(len, addr) ({ 0; }) +#define audit_set_macxattr(n) do { ; } while (0) #define audit_mq_open(o,m,a) ((void)0) #define audit_mq_sendrecv(d,l,p,t) ((void)0) #define audit_mq_notify(d,n) ((void)0) @@ -655,7 +579,7 @@ extern int audit_signals; #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -#endif /* CONFIG_AUDITSYSCALL */ +#endif #ifdef CONFIG_AUDIT /* These are defined in audit.c */ diff --git a/trunk/include/linux/kref.h b/trunk/include/linux/kref.h index 9c07dcebded7..abc0120b09b7 100644 --- a/trunk/include/linux/kref.h +++ b/trunk/include/linux/kref.h @@ -17,7 +17,6 @@ #include #include -#include struct kref { atomic_t refcount; diff --git a/trunk/include/linux/ptrace.h b/trunk/include/linux/ptrace.h index c2f1f6a5fcb8..a27e56ca41a4 100644 --- a/trunk/include/linux/ptrace.h +++ b/trunk/include/linux/ptrace.h @@ -112,7 +112,6 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ -#include /* for IS_ERR_VALUE */ extern long arch_ptrace(struct task_struct *child, long request, @@ -267,15 +266,6 @@ static inline void ptrace_release_task(struct task_struct *task) #define force_successful_syscall_return() do { } while (0) #endif -#ifndef is_syscall_success -/* - * On most systems we can tell if a syscall is a success based on if the retval - * is an error value. On some systems like ia64 and powerpc they have different - * indicators of success/failure and must define their own. - */ -#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) -#endif - /* * should define the following things inside #ifdef __KERNEL__. * diff --git a/trunk/include/linux/tty_driver.h b/trunk/include/linux/tty_driver.h index 5cf685086dd3..ecdaeb98b293 100644 --- a/trunk/include/linux/tty_driver.h +++ b/trunk/include/linux/tty_driver.h @@ -312,6 +312,7 @@ struct tty_driver { */ struct tty_struct **ttys; struct ktermios **termios; + struct ktermios **termios_locked; void *driver_state; /* diff --git a/trunk/include/trace/events/btrfs.h b/trunk/include/trace/events/btrfs.h index 84f3001a568d..b31702ac15be 100644 --- a/trunk/include/trace/events/btrfs.h +++ b/trunk/include/trace/events/btrfs.h @@ -16,8 +16,6 @@ struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; -struct btrfs_block_group_cache; -struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; @@ -46,17 +44,6 @@ struct extent_buffer; obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? 
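The is_syscall_success() macro removed from linux/ptrace.h above relies on the convention that a syscall's register return value encodes failure as a small negative errno, with IS_ERR_VALUE() treating the top 4095 values of an unsigned long as error codes. A runnable restatement of that convention (userspace C; MAX_ERRNO mirrors include/linux/err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO       4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static int is_syscall_success(long retval)
{
        return !IS_ERR_VALUE((unsigned long)retval);
}

int main(void)
{
        printf("ret=3       -> %s\n", is_syscall_success(3) ? "success" : "failure");
        printf("ret=-ENOENT -> %s\n", is_syscall_success(-ENOENT) ? "success" : "failure");
        /* Only the top 4095 values count as errors: */
        printf("ret=-4096   -> %s\n", is_syscall_success(-4096) ? "success" : "failure");
        return 0;
}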
__show_root_type(obj) : "-" -#define BTRFS_GROUP_FLAGS \ - { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ - { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ - { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ - { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ - { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ - { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ - { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} - -#define BTRFS_UUID_SIZE 16 - TRACE_EVENT(btrfs_transaction_commit, TP_PROTO(struct btrfs_root *root), @@ -634,34 +621,6 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level) ); -TRACE_EVENT(btrfs_space_reservation, - - TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val, - u64 bytes, int reserve), - - TP_ARGS(fs_info, type, val, bytes, reserve), - - TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) - __string( type, type ) - __field( u64, val ) - __field( u64, bytes ) - __field( int, reserve ) - ), - - TP_fast_assign( - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); - __assign_str(type, type); - __entry->val = val; - __entry->bytes = bytes; - __entry->reserve = reserve; - ), - - TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type), - __entry->val, __entry->reserve ? "reserve" : "release", - __entry->bytes) -); - DECLARE_EVENT_CLASS(btrfs__reserved_extent, TP_PROTO(struct btrfs_root *root, u64 start, u64 len), @@ -700,168 +659,6 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TP_ARGS(root, start, len) ); -TRACE_EVENT(find_free_extent, - - TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size, - u64 data), - - TP_ARGS(root, num_bytes, empty_size, data), - - TP_STRUCT__entry( - __field( u64, root_objectid ) - __field( u64, num_bytes ) - __field( u64, empty_size ) - __field( u64, data ) - ), - - TP_fast_assign( - __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; - ), - - TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " - "flags = %Lu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", - BTRFS_GROUP_FLAGS)) -); - -DECLARE_EVENT_CLASS(btrfs__reserve_extent, - - TP_PROTO(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, u64 start, - u64 len), - - TP_ARGS(root, block_group, start, len), - - TP_STRUCT__entry( - __field( u64, root_objectid ) - __field( u64, bg_objectid ) - __field( u64, flags ) - __field( u64, start ) - __field( u64, len ) - ), - - TP_fast_assign( - __entry->root_objectid = root->root_key.objectid; - __entry->bg_objectid = block_group->key.objectid; - __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; - ), - - TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " - "start = %Lu, len = %Lu", - show_root_type(__entry->root_objectid), __entry->bg_objectid, - __entry->flags, __print_flags((unsigned long)__entry->flags, - "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) -); - -DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - - TP_PROTO(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, u64 start, - u64 len), - - TP_ARGS(root, block_group, start, len) -); - -DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - - TP_PROTO(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, u64 start, - u64 len), - - TP_ARGS(root, block_group, start, len) -); - -TRACE_EVENT(btrfs_find_cluster, - - TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, - 
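The deleted BTRFS_GROUP_FLAGS table pairs each block-group bit with a printable name so __print_flags() can render a mask such as DATA|RAID1 when the trace buffer is read. The same table-driven formatting in runnable userspace C (a sketch; the flag values here are only for the example):

#include <stdio.h>

struct flag_name {
        unsigned long   bit;
        const char      *name;
};

static const struct flag_name group_flags[] = {
        { 1UL << 0, "DATA" },
        { 1UL << 1, "SYSTEM" },
        { 1UL << 2, "METADATA" },
        { 1UL << 3, "RAID0" },
        { 1UL << 4, "RAID1" },
};

/* Print a mask as NAME|NAME|..., the way __print_flags() renders it. */
static void print_flags(unsigned long mask)
{
        const char *sep = "";
        size_t i;

        for (i = 0; i < sizeof(group_flags) / sizeof(group_flags[0]); i++) {
                if (mask & group_flags[i].bit) {
                        printf("%s%s", sep, group_flags[i].name);
                        sep = "|";
                }
        }
        printf("\n");
}

int main(void)
{
        print_flags((1UL << 0) | (1UL << 4));   /* prints DATA|RAID1 */
        return 0;
}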
u64 bytes, u64 empty_size, u64 min_bytes), - - TP_ARGS(block_group, start, bytes, empty_size, min_bytes), - - TP_STRUCT__entry( - __field( u64, bg_objectid ) - __field( u64, flags ) - __field( u64, start ) - __field( u64, bytes ) - __field( u64, empty_size ) - __field( u64, min_bytes ) - ), - - TP_fast_assign( - __entry->bg_objectid = block_group->key.objectid; - __entry->flags = block_group->flags; - __entry->start = start; - __entry->bytes = bytes; - __entry->empty_size = empty_size; - __entry->min_bytes = min_bytes; - ), - - TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," - " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, - __entry->flags, - __print_flags((unsigned long)__entry->flags, "|", - BTRFS_GROUP_FLAGS), __entry->start, - __entry->bytes, __entry->empty_size, __entry->min_bytes) -); - -TRACE_EVENT(btrfs_failed_cluster_setup, - - TP_PROTO(struct btrfs_block_group_cache *block_group), - - TP_ARGS(block_group), - - TP_STRUCT__entry( - __field( u64, bg_objectid ) - ), - - TP_fast_assign( - __entry->bg_objectid = block_group->key.objectid; - ), - - TP_printk("block_group = %Lu", __entry->bg_objectid) -); - -TRACE_EVENT(btrfs_setup_cluster, - - TP_PROTO(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, u64 size, int bitmap), - - TP_ARGS(block_group, cluster, size, bitmap), - - TP_STRUCT__entry( - __field( u64, bg_objectid ) - __field( u64, flags ) - __field( u64, start ) - __field( u64, max_size ) - __field( u64, size ) - __field( int, bitmap ) - ), - - TP_fast_assign( - __entry->bg_objectid = block_group->key.objectid; - __entry->flags = block_group->flags; - __entry->start = cluster->window_start; - __entry->max_size = cluster->max_size; - __entry->size = size; - __entry->bitmap = bitmap; - ), - - TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " - "size = %Lu, max_size = %Lu, bitmap = %d", - __entry->bg_objectid, - __entry->flags, - __print_flags((unsigned long)__entry->flags, "|", - BTRFS_GROUP_FLAGS), __entry->start, - __entry->size, __entry->max_size, __entry->bitmap) -); - #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 3f42cd66f0f8..6ac2236244c3 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -355,7 +355,7 @@ config AUDIT config AUDITSYSCALL bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM) + depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH) default y if SECURITY_SELINUX help Enable low-overhead system-call auditing infrastructure that @@ -372,20 +372,6 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY -config AUDIT_LOGINUID_IMMUTABLE - bool "Make audit loginuid immutable" - depends on AUDIT - help - The config option toggles if a task setting its loginuid requires - CAP_SYS_AUDITCONTROL or if that task should require no special permissions - but should instead only allow setting its loginuid if it was never - previously set. On systems which use systemd or a similar central - process to restart login services this should be set to true. On older - systems in which an admin would typically have to directly stop and - start processes this should be set to false. Setting this to true allows - one to drop potentially dangerous capabilites from the login tasks, - but may not be backwards compatible with older init systems. 
- source "kernel/irq/Kconfig" menu "RCU Subsystem" diff --git a/trunk/kernel/audit.c b/trunk/kernel/audit.c index bb0eb5bb9a0a..57e3f5107937 100644 --- a/trunk/kernel/audit.c +++ b/trunk/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, "%s", prefix); + audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/trunk/kernel/audit.h b/trunk/kernel/audit.h index 816766803371..91e7071c4d2c 100644 --- a/trunk/kernel/audit.h +++ b/trunk/kernel/audit.h @@ -36,8 +36,12 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and fill it in at syscall + * and always fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it diff --git a/trunk/kernel/auditfilter.c b/trunk/kernel/auditfilter.c index a6c3f1abd206..f8277c80d678 100644 --- a/trunk/kernel/auditfilter.c +++ b/trunk/kernel/auditfilter.c @@ -235,15 +235,13 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: - if (rule->action == AUDIT_ALWAYS) - goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -387,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) + if ((f->val & ~S_IFMT) > S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -461,8 +459,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: - case AUDIT_OBJ_UID: - case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -526,6 +522,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: + err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -539,11 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_FIELD_COMPARE: - if (f->val > AUDIT_MAX_FIELD_COMPARE) + if ((f->val & ~S_IFMT) > S_IFMT) goto exit_free; break; default: diff --git a/trunk/kernel/auditsc.c b/trunk/kernel/auditsc.c index caaea6e944f8..e7fe2b0d29b3 100644 --- a/trunk/kernel/auditsc.c +++ b/trunk/kernel/auditsc.c @@ -70,15 +70,9 @@ #include "audit.h" -/* flags stating the success for a syscall */ -#define AUDITSC_INVALID 0 -#define 
AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 - /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). If we get more names we will allocate - * a name dynamically and also add those to the list anchored by names_list. */ -#define AUDIT_NAMES 5 + * for saving names from getname(). */ +#define AUDIT_NAMES 20 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -107,8 +101,9 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { - struct list_head list; /* audit_context->names_list */ const char *name; + int name_len; /* number of name's characters to log */ + unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -118,14 +113,6 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - bool name_put; /* call __putname() for this name */ - /* - * This was an allocated audit_names and not from the array of - * names allocated in the task audit context. Thus this name - * should be freed on syscall exit - */ - bool should_free; }; struct audit_aux_data { @@ -187,17 +174,8 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - /* - * The names_list is the list of all audit_names collected during this - * syscall. The first AUDIT_NAMES entries in the names_list will - * actually be from the preallocated_names array for performance - * reasons. Except during allocation they should never be referenced - * through the preallocated_names array and should only be found/used - * by running the names_list. - */ - struct audit_names preallocated_names[AUDIT_NAMES]; - int name_count; /* total records in names_list */ - struct list_head names_list; /* anchor for struct audit_names->list */ + int name_count; + struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -327,21 +305,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int val) +static int audit_match_filetype(struct audit_context *ctx, int which) { - struct audit_names *n; - umode_t mode = (umode_t)val; + unsigned index = which & ~S_IFMT; + umode_t mode = which & S_IFMT; if (unlikely(!ctx)) return 0; - list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && - ((n->mode & S_IFMT) == mode)) - return 1; - } - - return 0; + if (index >= ctx->name_count) + return 0; + if (ctx->names[index].ino == -1) + return 0; + if ((ctx->names[index].mode ^ mode) & S_IFMT) + return 0; + return 1; } /* @@ -463,134 +441,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } -static int audit_compare_id(uid_t uid1, - struct audit_names *name, - unsigned long name_offset, - struct audit_field *f, - struct audit_context *ctx) -{ - struct audit_names *n; - unsigned long addr; - uid_t uid2; - int rc; - - BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - - if (name) { - addr = (unsigned long)name; - addr += name_offset; - - uid2 = *(uid_t *)addr; - rc = audit_comparator(uid1, f->op, uid2); - if (rc) - return rc; - } - - if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - addr = (unsigned long)n; - addr += name_offset; - - uid2 = *(uid_t *)addr; - - rc = 
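The audit_match_filetype() variant added above packs two things into the single rule value: the bits outside S_IFMT select which names[] slot to test, and the S_IFMT bits carry the expected file type, whereas the list-walking version it replaces matched against any collected name. Unpacking such a value looks like this (runnable C):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        unsigned which = 2 | S_IFREG;           /* slot 2, expect a regular file */
        unsigned index = which & ~S_IFMT;       /* -> 2 */
        unsigned mode  = which & S_IFMT;        /* -> S_IFREG */

        printf("index=%u matches_reg=%d\n", index, mode == S_IFREG);
        return 0;
}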
audit_comparator(uid1, f->op, uid2); - if (rc) - return rc; - } - } - return 0; -} - -static int audit_field_compare(struct task_struct *tsk, - const struct cred *cred, - struct audit_field *f, - struct audit_context *ctx, - struct audit_names *name) -{ - switch (f->val) { - /* process to file object comparisons */ - case AUDIT_COMPARE_UID_TO_OBJ_UID: - return audit_compare_id(cred->uid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_GID_TO_OBJ_GID: - return audit_compare_id(cred->gid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_EUID_TO_OBJ_UID: - return audit_compare_id(cred->euid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_EGID_TO_OBJ_GID: - return audit_compare_id(cred->egid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_id(tsk->loginuid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_SUID_TO_OBJ_UID: - return audit_compare_id(cred->suid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_SGID_TO_OBJ_GID: - return audit_compare_id(cred->sgid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_FSUID_TO_OBJ_UID: - return audit_compare_id(cred->fsuid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_FSGID_TO_OBJ_GID: - return audit_compare_id(cred->fsgid, - name, offsetof(struct audit_names, gid), - f, ctx); - /* uid comparisons */ - case AUDIT_COMPARE_UID_TO_AUID: - return audit_comparator(cred->uid, f->op, tsk->loginuid); - case AUDIT_COMPARE_UID_TO_EUID: - return audit_comparator(cred->uid, f->op, cred->euid); - case AUDIT_COMPARE_UID_TO_SUID: - return audit_comparator(cred->uid, f->op, cred->suid); - case AUDIT_COMPARE_UID_TO_FSUID: - return audit_comparator(cred->uid, f->op, cred->fsuid); - /* auid comparisons */ - case AUDIT_COMPARE_AUID_TO_EUID: - return audit_comparator(tsk->loginuid, f->op, cred->euid); - case AUDIT_COMPARE_AUID_TO_SUID: - return audit_comparator(tsk->loginuid, f->op, cred->suid); - case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_comparator(tsk->loginuid, f->op, cred->fsuid); - /* euid comparisons */ - case AUDIT_COMPARE_EUID_TO_SUID: - return audit_comparator(cred->euid, f->op, cred->suid); - case AUDIT_COMPARE_EUID_TO_FSUID: - return audit_comparator(cred->euid, f->op, cred->fsuid); - /* suid comparisons */ - case AUDIT_COMPARE_SUID_TO_FSUID: - return audit_comparator(cred->suid, f->op, cred->fsuid); - /* gid comparisons */ - case AUDIT_COMPARE_GID_TO_EGID: - return audit_comparator(cred->gid, f->op, cred->egid); - case AUDIT_COMPARE_GID_TO_SGID: - return audit_comparator(cred->gid, f->op, cred->sgid); - case AUDIT_COMPARE_GID_TO_FSGID: - return audit_comparator(cred->gid, f->op, cred->fsgid); - /* egid comparisons */ - case AUDIT_COMPARE_EGID_TO_SGID: - return audit_comparator(cred->egid, f->op, cred->sgid); - case AUDIT_COMPARE_EGID_TO_FSGID: - return audit_comparator(cred->egid, f->op, cred->fsgid); - /* sgid comparison */ - case AUDIT_COMPARE_SGID_TO_FSGID: - return audit_comparator(cred->sgid, f->op, cred->fsgid); - default: - WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); - return 0; - } - return 0; -} - /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
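audit_compare_id() above avoids one helper per field by taking offsetof(struct audit_names, uid) or offsetof(struct audit_names, gid) and reading the id through pointer arithmetic; the BUILD_BUG_ON pins sizeof(uid_t) == sizeof(gid_t) so either offset yields a correctly sized read. The core trick in runnable C (the struct is illustrative):

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>

struct rec {
        uid_t uid;
        gid_t gid;
};

/*
 * Read the id that lives 'off' bytes into *r. Valid only while
 * uid_t and gid_t have the same size, as the kernel code asserts.
 */
static uid_t id_at(const struct rec *r, size_t off)
{
        return *(const uid_t *)((const char *)r + off);
}

int main(void)
{
        struct rec r = { .uid = 1000, .gid = 100 };

        printf("uid=%u gid=%u\n",
               (unsigned)id_at(&r, offsetof(struct rec, uid)),
               (unsigned)id_at(&r, offsetof(struct rec, gid)));
        return 0;
}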
@@ -607,14 +457,13 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, need_sid = 1; + int i, j, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; - struct audit_names *n; int result = 0; switch (f->type) { @@ -673,14 +522,12 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) { - if (audit_comparator(MAJOR(name->dev), f->op, f->val) || - audit_comparator(MAJOR(name->rdev), f->op, f->val)) - ++result; - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MAJOR(n->dev), f->op, f->val) || - audit_comparator(MAJOR(n->rdev), f->op, f->val)) { + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -688,14 +535,12 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) { - if (audit_comparator(MINOR(name->dev), f->op, f->val) || - audit_comparator(MINOR(name->rdev), f->op, f->val)) - ++result; - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MINOR(n->dev), f->op, f->val) || - audit_comparator(MINOR(n->rdev), f->op, f->val)) { + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -706,32 +551,8 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->ino, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_OBJ_UID: - if (name) { - result = audit_comparator(name->uid, f->op, f->val); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->uid, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_OBJ_GID: - if (name) { - result = audit_comparator(name->gid, f->op, f->val); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->gid, f->op, f->val)) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { ++result; break; } @@ -786,10 +607,11 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { + for (j = 0; j < ctx->name_count; j++) { + if (security_audit_rule_match( + ctx->names[j].osid, + f->type, f->op, + f->lsm_rule, ctx)) { ++result; break; } @@ -821,10 +643,8 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; - case AUDIT_FIELD_COMPARE: - result = audit_field_compare(tsk, cred, f, ctx, name); - break; } + if (!result) return 0; } @@ -902,53 +722,40 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* - * Given an audit_name check the inode hash table to see if they match. 
- * Called holding the rcu read lock to protect the use of audit_inode_hash - */ -static int audit_filter_inode_name(struct task_struct *tsk, - struct audit_names *n, - struct audit_context *ctx) { - int word, bit; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - struct audit_entry *e; - enum audit_state state; - - word = AUDIT_WORD(ctx->major); - bit = AUDIT_BIT(ctx->major); - - if (list_empty(list)) - return 0; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { - ctx->current_state = state; - return 1; - } - } - - return 0; -} - -/* At syscall exit time, this filter is called if any audit_names have been +/* At syscall exit time, this filter is called if any audit_names[] have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names. + * buckets applicable to the inode numbers in audit_names[]. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - struct audit_names *n; + int i; + struct audit_entry *e; + enum audit_state state; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); + for (i = 0; i < ctx->name_count; i++) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + struct audit_names *n = &ctx->names[i]; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_filter_inode_name(tsk, n, ctx)) - break; + if (list_empty(list)) + continue; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return; + } + } } rcu_read_unlock(); } @@ -959,7 +766,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (!context) + if (likely(!context)) return NULL; context->return_valid = return_valid; @@ -992,7 +799,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - struct audit_names *n, *next; + int i; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -1003,9 +810,10 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - list_for_each_entry(n, &context->names_list, list) { + for (i = 0; i < context->name_count; i++) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + context->names[i].name, + context->names[i].name ?: "(null)"); } dump_stack(); return; @@ -1016,12 +824,9 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - list_for_each_entry_safe(n, next, &context->names_list, list) { - list_del(&n->list); - if (n->name && n->name_put) - __putname(n->name); - if (n->should_free) - kfree(n); + for (i = 0; i < context->name_count; i++) { + if (context->names[i].name && context->names[i].name_put) + __putname(context->names[i].name); } context->name_count = 0; path_put(&context->pwd); @@ -1059,7 +864,6 @@ static inline struct audit_context *audit_alloc_context(enum 
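audit_filter_inodes() above walks the per-bucket rule lists under rcu_read_lock() with list_for_each_entry_rcu(), letting rule updates proceed concurrently while each reader sees a consistent list. The reader-side shape as a kernel-style sketch (illustrative types; a real writer must pair this with list_add_rcu()/list_del_rcu() and deferred freeing):

#include <linux/rcupdate.h>
#include <linux/list.h>

struct rule {
        struct list_head list;
        int              id;
};

/* Return the id of the first matching rule in the bucket, or -1. */
static int first_match(struct list_head *bucket, int key)
{
        struct rule *r;
        int ret = -1;

        rcu_read_lock();
        list_for_each_entry_rcu(r, bucket, list) {
                if (r->id == key) {
                        ret = r->id;
                        break;
                }
        }
        rcu_read_unlock();
        return ret;
}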
audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); - INIT_LIST_HEAD(&context->names_list); return context; } @@ -1082,7 +886,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) + if (likely(state == AUDIT_DISABLED)) return 0; if (!(context = audit_alloc_context(state))) { @@ -1171,7 +975,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, " exe=", + audit_log_d_path(ab, "exe=", &vma->vm_file->f_path); break; } @@ -1362,8 +1166,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i, len; - size_t len_sent = 0; + int i; + size_t len, len_sent = 0; const char __user *p; char *buf; @@ -1520,68 +1324,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } -static void audit_log_name(struct audit_context *context, struct audit_names *n, - int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", record_num); - - if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); -} - static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1589,7 +1331,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; - struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1725,14 +1466,70 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, " cwd=", &context->pwd); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } + for (i = 0; i < context->name_count; i++) { + struct audit_names *n = &context->names[i]; - i = 0; - list_for_each_entry(n, &context->names_list, list) - audit_log_name(context, n, i++, &call_panic); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + continue; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", i); + + if 
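audit_log_name(), removed in the hunk above, shows the standard sequence for emitting one audit record: audit_log_start() opens a buffer for a given record type, audit_log_format() and audit_log_untrustedstring() append fields (the latter escaping user-controlled strings), and audit_log_end() queues the record. A skeleton of that sequence (a sketch; the fields logged are illustrative):

#include <linux/audit.h>
#include <linux/gfp.h>

static void log_one_item(struct audit_context *context, int item,
                         const char *name)
{
        struct audit_buffer *ab;

        ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
        if (!ab)
                return;         /* audit_panic() has already been called */

        audit_log_format(ab, "item=%d", item);
        if (name) {
                audit_log_format(ab, " name=");
                /* escape the untrusted, user-controlled string */
                audit_log_untrustedstring(ab, name);
        } else {
                audit_log_format(ab, " name=(null)");
        }
        audit_log_end(ab);
}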
(n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, "name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1748,12 +1545,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void __audit_free(struct task_struct *tsk) +void audit_free(struct task_struct *tsk) { struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (!context) + if (likely(!context)) return; /* Check for system calls that do not go through the exit @@ -1786,7 +1583,7 @@ void __audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void __audit_syscall_entry(int arch, int major, +void audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -1794,7 +1591,7 @@ void __audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (!context) + if (unlikely(!context)) return; /* @@ -1851,7 +1648,7 @@ void __audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (state == AUDIT_DISABLED) + if (likely(state == AUDIT_DISABLED)) return; context->serial = 0; @@ -1861,9 +1658,30 @@ void __audit_syscall_entry(int arch, int major, context->ppid = 0; } +void audit_finish_fork(struct task_struct *child) +{ + struct audit_context *ctx = current->audit_context; + struct audit_context *p = child->audit_context; + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) + return; + p->arch = ctx->arch; + p->major = ctx->major; + memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); + p->ctime = ctx->ctime; + p->dummy = ctx->dummy; + p->in_syscall = ctx->in_syscall; + p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); + p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; +} + /** * audit_syscall_exit - deallocate audit context after a system call - * @pt_regs: syscall registers + * @valid: success/failure flag + * @return_code: syscall return value * * Tear down after system call. 
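Both the audit_filter_inodes() loop earlier in this patch and audit_filter_syscall() gate rules on the same per-syscall bitmask test: a rule can only match when the bit for the current syscall number is set in its mask words. Below is a minimal standalone sketch of that test (userspace C, not kernel code); the macros mirror include/linux/audit.h, while struct fake_rule is a hypothetical stand-in for the kernel's struct audit_krule.

/* Sketch of the AUDIT_WORD/AUDIT_BIT rule-mask test used by the filters
 * above. Assumptions: the macros are copied from include/linux/audit.h;
 * fake_rule is a simplified stand-in for struct audit_krule. */
#include <stdio.h>
#include <stdint.h>

#define AUDIT_BITMASK_SIZE 64
#define AUDIT_WORD(nr) ((uint32_t)((nr) / 32))
#define AUDIT_BIT(nr)  (1U << ((nr) - AUDIT_WORD(nr) * 32))

struct fake_rule {
	uint32_t mask[AUDIT_BITMASK_SIZE];	/* one bit per syscall number */
};

static int rule_matches_syscall(const struct fake_rule *r, int major)
{
	int word = AUDIT_WORD(major);
	uint32_t bit = AUDIT_BIT(major);

	/* same test as (e->rule.mask[word] & bit) == bit in the hunks above */
	return (r->mask[word] & bit) == bit;
}

int main(void)
{
	struct fake_rule r = { { 0 } };
	int nr = 59;				/* e.g. execve on x86-64 */

	r.mask[AUDIT_WORD(nr)] |= AUDIT_BIT(nr);
	printf("syscall %d matches: %d\n", nr, rule_matches_syscall(&r, nr));
	printf("syscall %d matches: %d\n", nr + 1, rule_matches_syscall(&r, nr + 1));
	return 0;
}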
If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1871,18 +1689,14 @@ void __audit_syscall_entry(int arch, int major, * message), then write out the syscall information. In all cases, * free the names stored from getname(). */ -void __audit_syscall_exit(int success, long return_code) +void audit_syscall_exit(int valid, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - if (success) - success = AUDITSC_SUCCESS; - else - success = AUDITSC_FAILURE; + context = audit_get_context(tsk, valid, return_code); - context = audit_get_context(tsk, success, return_code); - if (!context) + if (likely(!context)) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) @@ -2007,30 +1821,6 @@ static void handle_path(const struct dentry *dentry) #endif } -static struct audit_names *audit_alloc_name(struct audit_context *context) -{ - struct audit_names *aname; - - if (context->name_count < AUDIT_NAMES) { - aname = &context->preallocated_names[context->name_count]; - memset(aname, 0, sizeof(*aname)); - } else { - aname = kzalloc(sizeof(*aname), GFP_NOFS); - if (!aname) - return NULL; - aname->should_free = true; - } - - aname->ino = (unsigned long)-1; - list_add_tail(&aname->list, &context->names_list); - - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return aname; -} - /** * audit_getname - add a name to the list * @name: name to add @@ -2041,7 +1831,9 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - struct audit_names *n; + + if (IS_ERR(name) || !name) + return; if (!context->in_syscall) { #if AUDIT_DEBUG == 2 @@ -2051,15 +1843,13 @@ void __audit_getname(const char *name) #endif return; } - - n = audit_alloc_name(context); - if (!n) - return; - - n->name = name; - n->name_len = AUDIT_NAME_FULL; - n->name_put = true; - + BUG_ON(context->name_count >= AUDIT_NAMES); + context->names[context->name_count].name = name; + context->names[context->name_count].name_len = AUDIT_NAME_FULL; + context->names[context->name_count].name_put = 1; + context->names[context->name_count].ino = (unsigned long)-1; + context->names[context->name_count].osid = 0; + ++context->name_count; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -2081,13 +1871,12 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { - struct audit_names *n; int i; - - list_for_each_entry(n, &context->names_list, list) + for (i = 0; i < context->name_count; i++) printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); - } + context->names[i].name, + context->names[i].name ?: "(null)"); + } #endif __putname(name); } @@ -2108,11 +1897,39 @@ void audit_putname(const char *name) #endif } +static int audit_inc_name_count(struct audit_context *context, + const struct inode *inode) +{ + if (context->name_count >= AUDIT_NAMES) { + if (inode) + printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " + "dev=%02x:%02x, inode=%lu\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino); + + else + printk(KERN_DEBUG "name_count maxed, losing inode data\n"); + return 1; + } + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return 0; +} + + static inline int audit_copy_fcaps(struct audit_names *name, 
const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; + memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); + memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); + name->fcap.fE = 0; + name->fcap_ver = 0; + if (!dentry) return 0; @@ -2152,25 +1969,30 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { + int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; - struct audit_names *n; if (!context->in_syscall) return; - - list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) - goto out; + if (context->name_count + && context->names[context->name_count-1].name + && context->names[context->name_count-1].name == name) + idx = context->name_count - 1; + else if (context->name_count > 1 + && context->names[context->name_count-2].name + && context->names[context->name_count-2].name == name) + idx = context->name_count - 2; + else { + /* FIXME: how much do we care about inodes that have no + * associated name? */ + if (audit_inc_name_count(context, inode)) + return; + idx = context->name_count - 1; + context->names[idx].name = NULL; } - - /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); - if (!n) - return; -out: handle_path(dentry); - audit_copy_inode(n, dentry, inode); + audit_copy_inode(&context->names[idx], dentry, inode); } /** @@ -2189,11 +2011,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { + int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; - struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2203,7 +2025,9 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - list_for_each_entry(n, &context->names_list, list) { + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; + if (!n->name) continue; @@ -2216,7 +2040,9 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - list_for_each_entry(n, &context->names_list, list) { + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; + if (!n->name) continue; @@ -2234,29 +2060,34 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - n = audit_alloc_name(context); - if (!n) + if (audit_inc_name_count(context, parent)) return; - audit_copy_inode(n, NULL, parent); + idx = context->name_count - 1; + context->names[idx].name = NULL; + audit_copy_inode(&context->names[idx], NULL, parent); } if (!found_child) { - n = audit_alloc_name(context); - if (!n) + if (audit_inc_name_count(context, inode)) return; + idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. 
All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; + context->names[idx].name = found_parent; + context->names[idx].name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - n->name_put = false; + context->names[idx].name_put = 0; + } else { + context->names[idx].name = NULL; } if (inode) - audit_copy_inode(n, NULL, inode); + audit_copy_inode(&context->names[idx], NULL, inode); + else + context->names[idx].ino = (unsigned long)-1; } } EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2290,28 +2121,19 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set current task's audit_context loginuid + * audit_set_loginuid - set a task's audit_context loginuid + * @task: task whose audit context is being modified * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - struct task_struct *task = current; + unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; - unsigned int sessionid; - -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) - return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; @@ -2449,11 +2271,14 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int __audit_bprm(struct linux_binprm *bprm) +int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; + if (likely(!audit_enabled || !context || context->dummy)) + return 0; + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2474,10 +2299,13 @@ int __audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void __audit_socketcall(int nargs, unsigned long *args) +void audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; + if (likely(!context || context->dummy)) + return; + context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2503,10 +2331,13 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. 
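With this revert, the collection hooks above (audit_bprm(), audit_socketcall()) and audit_sockaddr() just below again carry their own fast path: if the task has no audit context, or the context is marked dummy because no filter rule selected it, the hook returns before allocating or copying anything. A toy standalone illustration of that guard follows, using hypothetical stand-ins (struct ctx, record_socketcall()) rather than the real kernel types.

/* Toy model of the !context / context->dummy fast path shared by the
 * hooks above. struct ctx and record_socketcall() are hypothetical
 * stand-ins, not kernel interfaces. */
#include <stdio.h>
#include <string.h>

struct ctx {
	int dummy;			/* set when no filter rule selected us */
	int nargs;
	unsigned long args[6];
};

static void record_socketcall(struct ctx *context, int nargs,
			      const unsigned long *args)
{
	if (!context || context->dummy)	/* cf. audit_socketcall() above */
		return;			/* nothing is recorded */
	context->nargs = nargs;
	memcpy(context->args, args, nargs * sizeof(unsigned long));
}

int main(void)
{
	struct ctx c = { .dummy = 0 };
	unsigned long a[2] = { 1, 42 };

	record_socketcall(&c, 2, a);
	printf("recorded %d args, first=%lu\n", c.nargs, c.args[0]);

	record_socketcall(NULL, 2, a);	/* no context: silently skipped */
	return 0;
}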
*/ -int __audit_sockaddr(int len, void *a) +int audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; + if (likely(!context || context->dummy)) + return 0; + if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) @@ -2668,25 +2499,6 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) -{ - uid_t auid, uid; - gid_t gid; - unsigned int sessionid; - - auid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - current_uid_gid(&uid, &gid); - - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " reason="); - audit_log_string(ab, reason); - audit_log_format(ab, " sig=%ld", signr); -} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2697,6 +2509,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) void audit_core_dumps(long signr) { struct audit_buffer *ab; + u32 sid; + uid_t auid = audit_get_loginuid(current), uid; + gid_t gid; + unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2705,17 +2521,24 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "memory violation", signr); - audit_log_end(ab); -} - -void __audit_seccomp(unsigned long syscall) -{ - struct audit_buffer *ab; + current_uid_gid(&uid, &gid); + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + security_task_getsecid(current, &sid); + if (sid) { + char *ctx = NULL; + u32 len; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", SIGKILL); - audit_log_format(ab, " syscall=%ld", syscall); + if (security_secid_to_secctx(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else { + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + } + } + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " sig=%ld", signr); audit_log_end(ab); } diff --git a/trunk/kernel/capability.c b/trunk/kernel/capability.c index 3f1adb6c6470..0fcf1c14a297 100644 --- a/trunk/kernel/capability.c +++ b/trunk/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(current_cred(), ns, cap) == 0) { + if (has_ns_capability(current, ns, cap)) { current->flags |= PF_SUPERPRIV; return true; } diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index 294b1709170d..c44738267be7 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -964,7 +964,8 @@ void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - audit_free(tsk); + if (unlikely(tsk->audit_context)) + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 051f090d40c1..f3fa18887cc9 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -1527,6 +1527,8 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } + audit_finish_fork(p); + /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that diff 
--git a/trunk/kernel/seccomp.c b/trunk/kernel/seccomp.c index e8d76c5895ea..57d4b13b631d 100644 --- a/trunk/kernel/seccomp.c +++ b/trunk/kernel/seccomp.c @@ -6,7 +6,6 @@ * This defines a simple but solid secure-computing mode. */ -#include <linux/audit.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/compat.h> @@ -55,7 +54,6 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall); do_exit(SIGKILL); } diff --git a/trunk/security/integrity/ima/ima_audit.c b/trunk/security/integrity/ima/ima_audit.c index 2ad942fb1e23..c5c5a72c30be 100644 --- a/trunk/security/integrity/ima/ima_audit.c +++ b/trunk/security/integrity/ima/ima_audit.c @@ -56,11 +56,9 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, audit_log_format(ab, " name="); audit_log_untrustedstring(ab, fname); } - if (inode) { - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); - } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, inode->i_ino); audit_log_format(ab, " res=%d", !result ? 0 : 1); audit_log_end(ab); } diff --git a/trunk/security/lsm_audit.c b/trunk/security/lsm_audit.c index 293b8c45b1d1..7bd6f138236b 100644 --- a/trunk/security/lsm_audit.c +++ b/trunk/security/lsm_audit.c @@ -232,14 +232,13 @@ static void dump_common_audit_data(struct audit_buffer *ab, case LSM_AUDIT_DATA_PATH: { struct inode *inode; - audit_log_d_path(ab, " path=", &a->u.path); + audit_log_d_path(ab, "path=", &a->u.path); inode = a->u.path.dentry->d_inode; - if (inode) { - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); - } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, + inode->i_ino); break; } case LSM_AUDIT_DATA_DENTRY: { @@ -249,11 +248,10 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, a->u.dentry->d_name.name); inode = a->u.dentry->d_inode; - if (inode) { - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); - } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, + inode->i_ino); break; } case LSM_AUDIT_DATA_INODE: { @@ -268,9 +266,8 @@ static void dump_common_audit_data(struct audit_buffer *ab, dentry->d_name.name); dput(dentry); } - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id, + inode->i_ino); break; } case LSM_AUDIT_DATA_TASK: @@ -318,7 +315,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, .dentry = u->dentry, .mnt = u->mnt }; - audit_log_d_path(ab, " path=", &path); + audit_log_d_path(ab, "path=", &path); break; } if (!u->addr) diff --git a/trunk/sound/core/Kconfig b/trunk/sound/core/Kconfig index b413ed05e74d..ad409381f8cc 100644 --- a/trunk/sound/core/Kconfig +++ b/trunk/sound/core/Kconfig @@ -12,9 +12,6 @@ config SND_HWDEP config SND_RAWMIDI tristate -config SND_COMPRESS_OFFLOAD - tristate - # To be effective this also requires INPUT - users should say: # select SND_JACK if INPUT=y || INPUT=SND # to avoid having to force INPUT on. @@ -157,6 +154,16 @@ config SND_DYNAMIC_MINORS If you are unsure about this, say N here. 
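The ima_audit.c and lsm_audit.c hunks above collapse the dev=/ino= output back into a single audit_log_format() call. The superblock id (s_id) is a kernel-controlled string, so plain %s formatting is sufficient there, while user-influenced strings such as file names keep going through audit_log_untrustedstring(), which escapes unprintable data. A crude userspace model of that distinction follows; log_untrusted() is a hypothetical stand-in and much simpler than the real helper.

/* Crude model of trusted vs. untrusted audit fields. log_untrusted() is a
 * hypothetical stand-in; the real audit_log_untrustedstring() hex-encodes
 * the whole string when it contains unprintable or quote characters. */
#include <stdio.h>
#include <ctype.h>

static void log_untrusted(const char *s)
{
	for (; *s; s++) {
		if (isprint((unsigned char)*s) && *s != '"')
			putchar(*s);
		else
			printf("\\x%02X", (unsigned char)*s);	/* escape */
	}
	putchar('\n');
}

int main(void)
{
	/* kernel-controlled: safe to format directly, as in the hunks above */
	printf(" dev=%s ino=%lu\n", "sda1", 1234UL);

	/* user-controlled: must not be trusted to be printable */
	log_untrusted("evil\nname");
	return 0;
}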
+config SND_COMPRESS_OFFLOAD + tristate "ALSA Compressed audio offload support" + default n + help + If you want support for offloading compressed audio and have such + hardware, then you should say Y here and also to the DSP driver + of your platform. + + If you are unsure about this, say N here. + config SND_SUPPORT_OLD_API bool "Support old ALSA API" default y diff --git a/trunk/sound/pci/au88x0/au88x0.c b/trunk/sound/pci/au88x0/au88x0.c index f13ad536b2d5..762bb108c51c 100644 --- a/trunk/sound/pci/au88x0/au88x0.c +++ b/trunk/sound/pci/au88x0/au88x0.c @@ -268,14 +268,8 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) card->shortname, chip->io, chip->irq); // (4) Alloc components. - err = snd_vortex_mixer(chip); - if (err < 0) { - snd_card_free(card); - return err; - } // ADB pcm. - err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_PCM); - if (err < 0) { + if ((err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_ADB)) < 0) { snd_card_free(card); return err; } @@ -305,6 +299,11 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) return err; } #endif + // snd_ac97_mixer and Vortex mixer. + if ((err = snd_vortex_mixer(chip)) < 0) { + snd_card_free(card); + return err; + } if ((err = snd_vortex_midi(chip)) < 0) { snd_card_free(card); return err; diff --git a/trunk/sound/pci/au88x0/au88x0.h b/trunk/sound/pci/au88x0/au88x0.h index bb938153a964..02f6e08f7592 100644 --- a/trunk/sound/pci/au88x0/au88x0.h +++ b/trunk/sound/pci/au88x0/au88x0.h @@ -105,7 +105,6 @@ #define MIX_SPDIF(x) (vortex->mixspdif[x]) #define NR_WTPB 0x20 /* WT channels per each bank. */ -#define NR_PCM 0x10 /* Structs */ typedef struct { diff --git a/trunk/sound/pci/au88x0/au88x0_pcm.c b/trunk/sound/pci/au88x0/au88x0_pcm.c index 0ef2f9712208..0488633ea874 100644 --- a/trunk/sound/pci/au88x0/au88x0_pcm.c +++ b/trunk/sound/pci/au88x0/au88x0_pcm.c @@ -168,7 +168,6 @@ static int snd_vortex_pcm_open(struct snd_pcm_substream *substream) runtime->hw = snd_vortex_playback_hw_adb; #ifdef CHIP_AU8830 if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && - VORTEX_IS_QUAD(vortex) && VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) { runtime->hw.channels_max = 4; snd_pcm_hw_constraint_list(runtime, 0, diff --git a/trunk/sound/pci/hda/hda_intel.c b/trunk/sound/pci/hda/hda_intel.c index fb35474c1203..0852e204a4c8 100644 --- a/trunk/sound/pci/hda/hda_intel.c +++ b/trunk/sound/pci/hda/hda_intel.c @@ -2498,7 +2498,6 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = { SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB), SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB), SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB), - SND_PCI_QUIRK(0x10de, 0xcb89, "Macbook Pro 7,1", POS_FIX_LPIB), SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB), SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB), SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB), diff --git a/trunk/sound/pci/hda/patch_sigmatel.c b/trunk/sound/pci/hda/patch_sigmatel.c index 3556408d6ece..87e684fa830f 100644 --- a/trunk/sound/pci/hda/patch_sigmatel.c +++ b/trunk/sound/pci/hda/patch_sigmatel.c @@ -1596,7 +1596,7 @@ static const struct snd_pci_quirk stac92hd73xx_cfg_tbl[] = { SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd, "Dell Studio 1557", STAC_DELL_M6_DMIC), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe, - "Dell Studio XPS 1645", STAC_DELL_M6_DMIC), + "Dell Studio XPS 1645", STAC_DELL_M6_BOTH), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413, "Dell Studio 1558", STAC_DELL_M6_DMIC), {} /* 
terminator */ diff --git a/trunk/sound/pci/oxygen/xonar_wm87x6.c b/trunk/sound/pci/oxygen/xonar_wm87x6.c index 63cff90706bf..478303e6c2b0 100644 --- a/trunk/sound/pci/oxygen/xonar_wm87x6.c +++ b/trunk/sound/pci/oxygen/xonar_wm87x6.c @@ -177,7 +177,6 @@ static void wm8776_registers_init(struct oxygen *chip) struct xonar_wm87x6 *data = chip->model_data; wm8776_write(chip, WM8776_RESET, 0); - wm8776_write(chip, WM8776_PHASESWAP, WM8776_PH_MASK); wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN | WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT); wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0);
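One detail worth pulling out of the auditsc.c hunks above: audit_set_loginuid() pairs every loginuid update with a session id taken from a global counter, so two logins under the same uid stay distinguishable in the resulting records. A compilable sketch of that pairing follows, assuming C11 atomics in place of the kernel's atomic_t and a hypothetical struct task in place of task_struct; the permission check is assumed to live with the /proc writer, as the fs/proc/base.c entry in the diffstat suggests.

/* Sketch of loginuid/sessionid pairing, not kernel code. Assumptions:
 * C11 <stdatomic.h> replaces the kernel's atomic_t/atomic_inc_return(),
 * and struct task is a hypothetical stand-in for task_struct fields. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint session_id;		/* cf. static atomic_t session_id */

struct task {
	unsigned int loginuid;
	unsigned int sessionid;
};

static int set_loginuid(struct task *t, unsigned int loginuid)
{
	/* atomic_fetch_add() returns the old value; +1 mimics the kernel's
	 * atomic_inc_return() so ids start at 1 */
	t->sessionid = atomic_fetch_add(&session_id, 1) + 1;
	t->loginuid = loginuid;
	return 0;
}

int main(void)
{
	struct task a = { 0 }, b = { 0 };

	set_loginuid(&a, 1000);
	set_loginuid(&b, 1000);		/* same uid, fresh session */
	printf("a: auid=%u ses=%u\n", a.loginuid, a.sessionid);
	printf("b: auid=%u ses=%u\n", b.loginuid, b.sessionid);
	return 0;
}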